Visualize atom weights of AttentiveFP #DGL #RDKit #Chemoinformatics

Yesterday, I posted an example of DGL (almost the same as the original example code).

With it, I could build a regression model on my own dataset. Fortunately, the DGL developers also provide code for visualizing the atom weights of a trained model.

This means that after building a model with AttentiveFP, you can visualize the atom weights of a given molecule, i.e. how much each atom contributes to the predicted target value.

I have seen many examples of the approach but had never tried it myself, so I gave it a try today.

The following code is the same as yesterday's.

%matplotlib inline 
import matplotlib.pyplot as plt
import os
from rdkit import Chem
from rdkit import RDPaths

import dgl
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from dgl import model_zoo

from dgl.data.chem.utils import mol_to_complete_graph, mol_to_bigraph

from dgl.data.chem.utils import atom_type_one_hot
from dgl.data.chem.utils import atom_degree_one_hot
from dgl.data.chem.utils import atom_formal_charge
from dgl.data.chem.utils import atom_num_radical_electrons
from dgl.data.chem.utils import atom_hybridization_one_hot
from dgl.data.chem.utils import atom_total_num_H_one_hot
from dgl.data.chem.utils import one_hot_encoding
from dgl.data.chem import CanonicalAtomFeaturizer
from dgl.data.chem import CanonicalBondFeaturizer
from dgl.data.chem import ConcatFeaturizer
from dgl.data.chem import BaseAtomFeaturizer
from dgl.data.chem import BaseBondFeaturizer

from dgl.data.chem import one_hot_encoding
from dgl.data.utils import split_dataset

from functools import partial
from sklearn.metrics import roc_auc_score
def chirality(atom):
    try:
        return one_hot_encoding(atom.GetProp('_CIPCode'), ['R', 'S']) + \
               [atom.HasProp('_ChiralityPossible')]
    except KeyError:
        return [False, False] + [atom.HasProp('_ChiralityPossible')]
    
def collate_molgraphs(data):
    """Batching a list of datapoints for dataloader.
    Parameters
    ----------
    data : list of 3-tuples or 4-tuples.
        Each tuple is for a single datapoint, consisting of
        a SMILES, a DGLGraph, all-task labels and optionally
        a binary mask indicating the existence of labels.
    Returns
    -------
    smiles : list
        List of smiles
    bg : BatchedDGLGraph
        Batched DGLGraphs
    labels : Tensor of dtype float32 and shape (B, T)
        Batched datapoint labels. B is len(data) and
        T is the number of total tasks.
    masks : Tensor of dtype float32 and shape (B, T)
        Batched datapoint binary mask, indicating the
        existence of labels. If binary masks are not
        provided, return a tensor with ones.
    """
    assert len(data[0]) in [3, 4], \
        'Expect the tuple to be of length 3 or 4, got {:d}'.format(len(data[0]))
    if len(data[0]) == 3:
        smiles, graphs, labels = map(list, zip(*data))
        masks = None
    else:
        smiles, graphs, labels, masks = map(list, zip(*data))

    bg = dgl.batch(graphs)
    bg.set_n_initializer(dgl.init.zero_initializer)
    bg.set_e_initializer(dgl.init.zero_initializer)
    labels = torch.stack(labels, dim=0)
    
    if masks is None:
        masks = torch.ones(labels.shape)
    else:
        masks = torch.stack(masks, dim=0)
    return smiles, bg, labels, masks

atom_featurizer = BaseAtomFeaturizer(
                 {'hv': ConcatFeaturizer([
                  partial(atom_type_one_hot, allowable_set=[
                          'B', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'As', 'Se', 'Br', 'Te', 'I', 'At'],
                    encode_unknown=True),
                  partial(atom_degree_one_hot, allowable_set=list(range(6))),
                  atom_formal_charge, atom_num_radical_electrons,
                  partial(atom_hybridization_one_hot, encode_unknown=True),
                  lambda atom: [0], # A placeholder for aromatic information,
                    atom_total_num_H_one_hot, chirality
                 ],
                )})
bond_featurizer = BaseBondFeaturizer({
                                     'he': lambda bond: [0 for _ in range(10)]
    })

train=os.path.join(RDPaths.RDDocsDir, 'Book/data/solubility.train.sdf')
test=os.path.join(RDPaths.RDDocsDir, 'Book/data/solubility.test.sdf')

train_mols = Chem.SDMolSupplier(train)
train_smi = [Chem.MolToSmiles(m) for m in train_mols]
train_sol = torch.tensor([float(mol.GetProp('SOL')) for mol in train_mols]).reshape(-1,1)

test_mols = Chem.SDMolSupplier(test)
test_smi = [Chem.MolToSmiles(m) for m in test_mols]
test_sol = torch.tensor([float(mol.GetProp('SOL')) for mol in test_mols]).reshape(-1,1)

train_graph = [mol_to_bigraph(mol,
                              atom_featurizer=atom_featurizer,
                              bond_featurizer=bond_featurizer) for mol in train_mols]

test_graph = [mol_to_bigraph(mol,
                             atom_featurizer=atom_featurizer,
                             bond_featurizer=bond_featurizer) for mol in test_mols]

def run_a_train_epoch(n_epochs, epoch, model, data_loader, loss_criterion, optimizer):
    model.train()
    losses = []

    for batch_id, batch_data in enumerate(data_loader):
        smiles, bg, labels, masks = batch_data
        if torch.cuda.is_available():
            bg.to(torch.device('cuda:0'))
            labels = labels.to('cuda:0')
            masks = masks.to('cuda:0')

        prediction = model(bg, bg.ndata['hv'], bg.edata['he'])
        # zero out the loss of datapoints without labels before averaging
        loss = (loss_criterion(prediction, labels) * (masks != 0).float()).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.data.item())

    total_score = np.mean(losses)
    print('epoch {:d}/{:d}, training {:.4f}'.format(epoch + 1, n_epochs, total_score))
    return total_score

model = model_zoo.chem.AttentiveFP(node_feat_size=39,
                                  edge_feat_size=10,
                                  num_layers=2,
                                  num_timesteps=2,
                                  graph_feat_size=200,
                                  output_size=1,
                                  dropout=0.2)
model = model.to('cuda:0')

train_loader = DataLoader(dataset=list(zip(train_smi, train_graph, train_sol)), batch_size=128, collate_fn=collate_molgraphs)
test_loader = DataLoader(dataset=list(zip(test_smi, test_graph, test_sol)), batch_size=128, collate_fn=collate_molgraphs)

loss_fn = nn.MSELoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(), lr=10 ** (-2.5), weight_decay=10 ** (-5.0),)
n_epochs = 100
epochs = []
scores = []
for e in range(n_epochs):
    score = run_a_train_epoch(n_epochs, e, model, train_loader, loss_fn, optimizer)
    epochs.append(e)
    scores.append(score)
model.eval()

OK, I built the predictive model. (Of course, the trained model can be saved with torch.save(model.state_dict(), PATH).)
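
For example, a minimal sketch of saving and reloading the trained weights (the file name here is just a placeholder):

# save the trained parameters
torch.save(model.state_dict(), 'attentivefp_sol.pth')
# later: rebuild the model with the same hyperparameters, then reload
model.load_state_dict(torch.load('attentivefp_sol.pth'))
model.eval()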

Let's visualize molecules with their atom weights!

At first, import packages for molecule visualization.

import copy
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
from IPython.display import SVG
from IPython.display import display
import matplotlib
import matplotlib.cm as cm

Then define the visualization function. The following code is borrowed from the original repository; thanks a lot. The DGL model has a get_node_weight option, which returns the node weights of the graph. Since the model was built with num_timesteps=2 (two GRU readout steps), timestep must be 0 or 1; in the code below I used 0.

def drawmol(idx, dataset, timestep):
    smiles, graph, _ = dataset[idx]
    print(smiles)
    bg = dgl.batch([graph])
    atom_feats, bond_feats = bg.ndata['hv'], bg.edata['he']
    if torch.cuda.is_available():
        print('use cuda')
        bg.to(torch.device('cuda:0'))
        atom_feats = atom_feats.to('cuda:0')
        bond_feats = bond_feats.to('cuda:0')
    
    _, atom_weights = model(bg, atom_feats, bond_feats, get_node_weight=True)
    assert timestep < len(atom_weights), 'Unexpected id for the readout round'
    atom_weights = atom_weights[timestep]
    min_value = torch.min(atom_weights)
    max_value = torch.max(atom_weights)
    atom_weights = (atom_weights - min_value) / (max_value - min_value)
    
    norm = matplotlib.colors.Normalize(vmin=0, vmax=1.28)
    cmap = cm.get_cmap('bwr')
    plt_colors = cm.ScalarMappable(norm=norm, cmap=cmap)
    atom_colors = {i: plt_colors.to_rgba(atom_weights[i].data.item()) for i in range(bg.number_of_nodes())}

    mol = Chem.MolFromSmiles(smiles)
    rdDepictor.Compute2DCoords(mol)
    drawer = rdMolDraw2D.MolDraw2DSVG(280, 280)
    drawer.SetFontSize(1)
    op = drawer.drawOptions()
    
    mol = rdMolDraw2D.PrepareMolForDrawing(mol)
    drawer.DrawMolecule(mol, highlightAtoms=range(bg.number_of_nodes()),
                             highlightBonds=[],
                             highlightAtomColors=atom_colors)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    svg = svg.replace('svg:', '')
    if torch.cuda.is_available():
        atom_weights = atom_weights.to('cpu')
    return (Chem.MolFromSmiles(smiles), atom_weights.data.numpy(), svg)

Draw the test dataset molecules. The model predicts solubility, and the color indicates each atom's contribution: red means a positive effect on solubility and blue a negative one.

target = test_loader.dataset
for i in range(len(target)):
    mol, aw, svg = drawmol(i, target, 0)
    display(SVG(svg))

Personally, I think hydroxyl groups should have a positive effect on solubility, but the model shows that this is not always the case. Hmm, is something wrong with my code? Or do I need to think more about the details of the model?
I would like to try more prediction tasks and write helper code that makes model building and molecular visualization with DGL's AttentiveFP more convenient.

Today’s whole code is uploaded below.
https://gist.github.com/iwatobipen/72a2d9dd616322f1f20469a152f2bb58

Any comments or suggestions will be highly appreciated. ;)

Molecular property regression with Attentive FP #RDKit #Chemoinformatics #DGL #DeepGraphLibrary

Recently, molecular-graph-based deep learning has been a hot area in chemoinformatics.
Some months ago, Xiong et al. published a new graph-based QSAR model named 'Attentive FP' in JMC.

As its name suggests, Attentive FP uses an attention mechanism in its architecture.

The authors disclosed their code, and fortunately recent versions of DGL also implement Attentive FP!
The DGL repository provides an example of molecular property regression with Attentive FP. However, it is hard to see from the example how to apply the model to your own dataset.
So I updated DGL and tried Attentive FP myself. In the following code I used the solubility data that ships with RDKit for my practice.

First, import several packages for deep learning. DGL has many functions for chemoinformatics tasks, so users don't need to implement the molecule-to-graph conversion routines themselves.

%matplotlib inline 
import matplotlib.pyplot as plt
import os
from rdkit import Chem
from rdkit import RDPaths

import dgl
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from dgl import model_zoo

from dgl.data.chem.utils import mol_to_complete_graph, mol_to_bigraph

from dgl.data.chem.utils import atom_type_one_hot
from dgl.data.chem.utils import atom_degree_one_hot
from dgl.data.chem.utils import atom_formal_charge
from dgl.data.chem.utils import atom_num_radical_electrons
from dgl.data.chem.utils import atom_hybridization_one_hot
from dgl.data.chem.utils import atom_total_num_H_one_hot
from dgl.data.chem.utils import one_hot_encoding
from dgl.data.chem import CanonicalAtomFeaturizer
from dgl.data.chem import CanonicalBondFeaturizer
from dgl.data.chem import ConcatFeaturizer
from dgl.data.chem import BaseAtomFeaturizer
from dgl.data.chem import BaseBondFeaturizer

from dgl.data.chem import one_hot_encoding
from dgl.data.utils import split_dataset

from functools import partial
from sklearn.metrics import roc_auc_score

Then I defined some helper functions for the task. Almost all of the code is borrowed from the original dgl/example. Thanks for sharing the nice code!

def chirality(atom):
    try:
        return one_hot_encoding(atom.GetProp('_CIPCode'), ['R', 'S']) + \
               [atom.HasProp('_ChiralityPossible')]
    except KeyError:
        return [False, False] + [atom.HasProp('_ChiralityPossible')]
    
def collate_molgraphs(data):
    """Batching a list of datapoints for dataloader.
    Parameters
    ----------
    data : list of 3-tuples or 4-tuples.
        Each tuple is for a single datapoint, consisting of
        a SMILES, a DGLGraph, all-task labels and optionally
        a binary mask indicating the existence of labels.
    Returns
    -------
    smiles : list
        List of smiles
    bg : BatchedDGLGraph
        Batched DGLGraphs
    labels : Tensor of dtype float32 and shape (B, T)
        Batched datapoint labels. B is len(data) and
        T is the number of total tasks.
    masks : Tensor of dtype float32 and shape (B, T)
        Batched datapoint binary mask, indicating the
        existence of labels. If binary masks are not
        provided, return a tensor with ones.
    """
    assert len(data[0]) in [3, 4], \
        'Expect the tuple to be of length 3 or 4, got {:d}'.format(len(data[0]))
    if len(data[0]) == 3:
        smiles, graphs, labels = map(list, zip(*data))
        masks = None
    else:
        smiles, graphs, labels, masks = map(list, zip(*data))

    bg = dgl.batch(graphs)
    bg.set_n_initializer(dgl.init.zero_initializer)
    bg.set_e_initializer(dgl.init.zero_initializer)
    labels = torch.stack(labels, dim=0)
    
    if masks is None:
        masks = torch.ones(labels.shape)
    else:
        masks = torch.stack(masks, dim=0)
    return smiles, bg, labels, masks


def run_a_train_epoch(n_epochs, epoch, model, data_loader, loss_criterion, optimizer):
    model.train()
    losses = []

    for batch_id, batch_data in enumerate(data_loader):
        smiles, bg, labels, masks = batch_data
        if torch.cuda.is_available():
            bg.to(torch.device('cuda:0'))
            labels = labels.to('cuda:0')
            masks = masks.to('cuda:0')

        prediction = model(bg, bg.ndata['hv'], bg.edata['he'])
        # zero out the loss of datapoints without labels before averaging
        loss = (loss_criterion(prediction, labels) * (masks != 0).float()).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        losses.append(loss.data.item())

    total_score = np.mean(losses)
    print('epoch {:d}/{:d}, training {:.4f}'.format(epoch + 1, n_epochs, total_score))
    return total_score

After that, I defined the atom and bond featurizer functions. Their settings are the same as in the original repository, but it is easy to modify the featurizers.

atom_featurizer = BaseAtomFeaturizer(
                 {'hv': ConcatFeaturizer([
                  partial(atom_type_one_hot, allowable_set=[
                          'B', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'As', 'Se', 'Br', 'Te', 'I', 'At'],
                    encode_unknown=True),
                  partial(atom_degree_one_hot, allowable_set=list(range(6))),
                  atom_formal_charge, atom_num_radical_electrons,
                  partial(atom_hybridization_one_hot, encode_unknown=True),
                  lambda atom: [0], # A placeholder for aromatic information,
                    atom_total_num_H_one_hot, chirality
                 ],
                )})
bond_featurizer = BaseBondFeaturizer({
                                     'he': lambda bond: [0 for _ in range(10)]
    })
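
As a quick sanity check (a sketch, assuming featurizer instances are callable on an RDKit mol in this version of dgl.data.chem), the concatenated atom features should be 39-dimensional, matching the node_feat_size=39 passed to the model below; the bond featurizer emits a 10-dimensional zero vector per bond, matching edge_feat_size=10:

mol = Chem.MolFromSmiles('CCO')
feats = atom_featurizer(mol)['hv']
print(feats.shape)  # expected: torch.Size([3, 39]) -> 3 heavy atoms, 39 features each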

If you would like to use the same featurizers as DeepChem, you can use CanonicalAtomFeaturizer and CanonicalBondFeaturizer.
https://docs.dgl.ai/en/latest/generated/dgl.data.chem.CanonicalAtomFeaturizer.html
https://docs.dgl.ai/en/latest/generated/dgl.data.chem.CanonicalBondFeaturizer.html
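
For example (a sketch; I'm assuming the atom_data_field / bond_data_field keyword names from the linked docs):

canonical_atom_featurizer = CanonicalAtomFeaturizer(atom_data_field='hv')
canonical_bond_featurizer = CanonicalBondFeaturizer(bond_data_field='he')
# note: the canonical features have different lengths, so node_feat_size and
# edge_feat_size of the model would have to be changed accordingly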

DGL seems friendly for chemoinformaticians, I think.

OK, let's load the dataset. The mol_to_bigraph method with featurizers converts an RDKit mol object into a graph object. There is also a smiles_to_bigraph method that converts SMILES directly into a graph! Cool ;)
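
For example (a sketch; I'm assuming smiles_to_bigraph accepts the same featurizer keyword arguments as mol_to_bigraph):

from dgl.data.chem.utils import smiles_to_bigraph

g = smiles_to_bigraph('c1ccccc1O',  # phenol
                      atom_featurizer=atom_featurizer,
                      bond_featurizer=bond_featurizer)
print(g.number_of_nodes())  # 7 heavy atoms -> 7 nodes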

train=os.path.join(RDPaths.RDDocsDir, 'Book/data/solubility.train.sdf')
test=os.path.join(RDPaths.RDDocsDir, 'Book/data/solubility.test.sdf')

train_mols = Chem.SDMolSupplier(train)
train_smi =[Chem.MolToSmiles(m) for m in train_mols]
train_sol = torch.tensor([float(mol.GetProp('SOL')) for mol in train_mols]).reshape(-1,1)

test_mols =  Chem.SDMolSupplier(test)
test_smi = [Chem.MolToSmiles(m) for m in test_mols]
test_sol = torch.tensor([float(mol.GetProp('SOL')) for mol in test_mols]).reshape(-1,1)


train_graph =[mol_to_bigraph(mol,
                           atom_featurizer=atom_featurizer, 
                           bond_featurizer=bond_featurizer) for mol in train_mols]

test_graph =[mol_to_bigraph(mol,
                           atom_featurizer=atom_featurizer, 
                           bond_featurizer=bond_featurizer) for mol in test_mols]

The AttentiveFP model is provided in model_zoo. Then define the dataloaders for training and test.

model = model_zoo.chem.AttentiveFP(node_feat_size=39,
                                  edge_feat_size=10,
                                  num_layers=2,
                                  num_timesteps=2,
                                  graph_feat_size=200,
                                  output_size=1,
                                  dropout=0.2)
model = model.to('cuda:0')

train_loader = DataLoader(dataset=list(zip(train_smi, train_graph, train_sol)), batch_size=128, collate_fn=collate_molgraphs)
test_loader = DataLoader(dataset=list(zip(test_smi, test_graph, test_sol)), batch_size=128, collate_fn=collate_molgraphs)

DataLoader is a native PyTorch class. It generates an iterator over batches of the dataset; a quick look at one batch is sketched below.
Now we're almost there! Let's go to the learning process.
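
As noted, here is what one batch from the loader looks like (a quick sketch):

smiles, bg, labels, masks = next(iter(train_loader))
print(len(smiles))               # up to 128 SMILES strings
print(bg.number_of_nodes())      # total number of atoms in the batched graph
print(labels.shape, masks.shape) # both torch.Size([batch_size, 1])

With that confirmed, here is the training loop: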

loss_fn = nn.MSELoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(), lr=10 ** (-2.5), weight_decay=10 ** (-5.0),)
n_epochs = 100
epochs = []
scores = []
for e in range(n_epochs):
    score = run_a_train_epoch(n_epochs, e, model, train_loader, loss_fn, optimizer)
    epochs.append(e)
    scores.append(score)

>>>output is below.
epoch 1/100, training 8.8096
----snip---
epoch 98/100, training 0.3706
epoch 99/100, training 0.3915
epoch 100/100, training 0.3003

Then plot the learning curve:

plt.plot(epochs, scores)

It seems that the learning process went well ;)

OK let’s validate the model!

model.eval()
all_pred = []
for test_data in test_loader:
    smi_lst, bg, labels, masks = test_data
    if torch.cuda.is_available():
        bg.to(torch.device('cuda:0'))
        labels = labels.to('cuda:0')
        masks = masks.to('cuda:0')
    pred = model(bg, bg.ndata['hv'], bg.edata['he'])
    all_pred.append(pred.data.cpu().numpy())
res = np.vstack(all_pred)
plt.clf()
plt.scatter(res, test_sol)
plt.xlabel('pred')
plt.ylabel('exp')
from sklearn.metrics import r2_score
print(r2_score(test_sol, res))
> 0.9098691301661277

Let’s compare to RandomForest.

from sklearn.ensemble import RandomForestRegressor
from rdkit import Chem
from rdkit.Chem import AllChem
train_fp = [AllChem.GetMorganFingerprintAsBitVect(mol,2) for mol in train_mols]
test_fp = [AllChem.GetMorganFingerprintAsBitVect(mol,2) for mol in test_mols]
# make RF regressor and train it
rfr = RandomForestRegressor()
# flatten the (N, 1) label tensor into a 1-D array for scikit-learn
rfr.fit(train_fp, train_sol.numpy().ravel())

Check the performance.

rfr_pred = rfr.predict(test_fp)
r2_score(test_sol, rfr_pred)
plt.clf()
plt.scatter(rfr_pred, test_sol)

The AttentiveFP model showed high performance for solubility prediction in this case (though my random forest code is not optimized). The DGL example code is very useful for DGL beginners, but it is hard to apply to your own dataset directly, so I needed to rewrite the code for my data.

Anyway, I would like to buy the DGL developers a beer. DGL is a very nice package for chemoinformatics and for 'RDKitds'. RDKitds is a new nickname for RDKit users, proposed at the RDKit UGM 2019 ;)
Today’s code is below.

my whole code.

Thanks,

Deoxofluorination of carboxylic acid #organic_chemistry

Now I have moved from the medchem team to the comp chem team, but I still have an interest in organic chemistry.

Today I found an interesting article in JOC.

The article describes a method for synthesizing trifluoromethyl building blocks (BBs) with SF4.
https://pubs.acs.org/doi/10.1021/acs.joc.9b02596

This elegant work was published by researchers at Enamine.

As you know, Enamine has a huge number of unique building blocks for drug discovery. Several years ago I had the opportunity to visit Enamine. It was well organized, with many highly skilled researchers.

OK, back to the article.

The authors found suitable reaction conditions for converting aliphatic carboxylic acids into trifluoromethyl groups.

Importantly, the reaction proceeds under mild conditions (55 °C), preserves the substrate's stereochemistry, and tolerates a wide range of functional groups with good yields.

The key point is the addition of a small amount of water. In the previous method, liquid HF was used as the reaction solvent. HF is highly toxic, so it is not a suitable solvent for routine lab work, I think.

Of course, SF4 is also a toxic reagent, and handling it requires a safety facility.

However, Scheme 4 shows the wide applicability of the reaction conditions.

And from the experimental section, the reaction can be conducted on gram scale.

This is very useful for medicinal chemistry, I think. New building block synthesis is very exciting work for me.

RDKit based CATS Descriptor #RDKit #Chemoinformatics

Chemically Advanced Template Search (CATS) was developed by Prof. Gisbert Schneider. The CATS descriptor is a fuzzy, ligand-pharmacophore-based descriptor, so it is suitable for scaffold hopping in virtual screening. Last week, I attended his lecture and became interested in the descriptor again. Fortunately, I could find some implementations of the CATS2D descriptor in GitHub repos.

Arthuc's work (the original work is by Rajarshi Guha) is nice because the code is written in Python and RDKit is used as the chemistry engine.

I modified that work and made a package for CATS2D descriptor calculation with RDKit.
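
If you want to try it, the package should be installable directly from the GitHub repository linked at the end of this post (a sketch, assuming a standard pip-installable layout):

pip install git+https://github.com/iwatobipen/CATS2D.git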

Let's see an example of a distance calculation. At first, import the packages for the calculation.

from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from scipy.spatial.distance import euclidean
from cats2d.rd_cats2d import CATS2D
import numpy as np

from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw

Then load two test molecules. These molecules are well-known drugs (sildenafil and vardenafil).

mol1 = Chem.MolFromSmiles('CCCC1=NN(C2=C1N=C(NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)C)OCC)C')
mol2 = Chem.MolFromSmiles('CCCC1=NC(=C2N1N=C(NC2=O)C3=C(C=CC(=C3)S(=O)(=O)N4CCN(CC4)CC)OCC)C')

OK, let's calculate the distance between these molecules with Morgan FP and CATS2D descriptors. The CATS2D descriptor is not a bit fingerprint, so I used the Euclidean distance for it. Surprisingly, the Tanimoto distance (1.0 - Tanimoto similarity) is rather high even though these molecules look similar.

fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
dist = 1.0 - DataStructs.TanimotoSimilarity(fp1, fp2)
print(dist)
> 0.41025641025641024

On the other hand, the CATS2D-based distance is 0.0. This indicates that the two molecules are almost the same in terms of their pharmacophore features.

cats = CATS2D()
cats1 = cats.getCATs2D(mol1)
cats2 = cats.getCATs2D(mol2)
euclidean(cats1, cats2)
> 0.0

The package can also provide per-atom pharmacophore information ('A' acceptor, 'D' donor, 'L' lipophilic, etc.).

print(cats.getPcoreGroups(mol1))
>
['', ['L'], '', '', ['A'], '', '', '', ['A'], '', ['D'], '', ['A'], '', '', '', '', '', '', '', ['A'], ['A'], ['A'], '', '', ['A'], '', '', '', ['A'], '', '', '']

print(cats.getPcoreGroups(mol2))
>
['', ['L'], '', '', ['A'], '', '', '', ['A'], '', ['D'], '', ['A'], '', '', '', '', '', '', '', ['A'], ['A'], ['A'], '', '', ['A'], '', '', '', '', ['A'], '', '', '']

Scaffold hopping is a very useful strategy in drug discovery, not only for improving compound properties but also for expanding IP space.

I would like to improve the package further; it is still under development.

Any comments and/or suggestions are greatly appreciated.

My code can be found at the following URL.
https://github.com/iwatobipen/CATS2D

Thanks for developing and sharing the CATS2D descriptor implementation!

Python package for Ensemble learning #Chemoinformatics #Scikit learn

Ensemble learning is a machine learning technique. I wrote a post about blending before; the URL is below.
https://iwatobipen.wordpress.com/2018/11/11/ensemble-learning-with-scikit-learn-and-xgboost-machine-learning/
At that time I implemented the code by myself.

Ensemble learning can sometimes outperform a single model, so the method is worth trying. Fortunately, we can now do ensemble learning very easily with a Python package named 'ML-Ens'. Installation is very easy: just use pip, the common way for pythonistas I think ;)
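
For example (the PyPI package name is mlens, matching the imports below):

pip install mlens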

After installing the package, users can build and train an ensemble learning model in a few lines. I would like to introduce two examples: one is the stacking method and the other is the blending method. OK, let's go to the code.

At first, load the dataset and make the input features. I used Morgan fingerprints as the input data.

from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from rdkit import RDPaths
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
import numpy as np
import pandas as pd
import os
from IPython.display import HTML

traindf = PandasTools.LoadSDF(os.path.join(RDPaths.RDDocsDir, 'Book/data/solubility.train.sdf'))
testdf = PandasTools.LoadSDF(os.path.join(RDPaths.RDDocsDir, 'Book/data/solubility.test.sdf'))
# Check data
HTML(traindf.head(2).to_html())

cls2lab = {'(A) low':0, '(B) medium':1, '(C) high':2}

def fp2np(fp):
    arr = np.zeros((0,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr
trainfp = [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in traindf.ROMol]
testfp =  [AllChem.GetMorganFingerprintAsBitVect(m, 2) for m in testdf.ROMol]
trainX = np.array([fp2np(fp) for fp in trainfp])
testX = np.array([fp2np(fp) for fp in testfp])
trainY = np.array([cls2lab[i] for i in traindf.SOL_classification.to_list()])
testY =  np.array([cls2lab[i] for i in testdf.SOL_classification.to_list()])

Then import several packages for ensemble learning. SuperLearner is the class for stacking and BlendEnsemble is the class for blending.

Making an ensemble model is easy: just use the add method to add layers, and finally call the add_meta method to add the final prediction layer.

from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, accuracy_score
from sklearn.svm import SVR, SVC

# For stacking
ensemble = SuperLearner(scorer=accuracy_score, random_state=794, verbose=2)
ensemble.add([RandomForestClassifier(n_estimators=100, random_state=794), SVC(gamma='auto', C=1000)])
ensemble.add_meta(LogisticRegression(solver='lbfgs', multi_class='auto'))

ensemble.fit(trainX, trainY)
pred = ensemble.predict(testX)
accuracy_score(testY, pred)

# Blending
from mlens.ensemble import BlendEnsemble
ensemble2 = BlendEnsemble(scorer=accuracy_score, test_size=0.2, verbose=2)
ensemble2.add([RandomForestClassifier(n_estimators=794, random_state=794),
                 SVC(gamma='auto')])
ensemble2.add_meta(LogisticRegression(solver='lbfgs', multi_class='auto'))
ensemble2.fit(trainX, trainY)
pred_b = ensemble2.predict(testX)
accuracy_score(pred_b, testY)

More models can also be added with the add method. I uploaded the whole code to my gist. After calling fit, it is easy to access the results via the data attribute.
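
For example, a quick sketch using the fitted stacking ensemble from above (I'm assuming the data attribute as described in the mlens docs):

print(ensemble.data)  # per-learner cross-validated scores and fit/predict times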

code example

Unfortunately, the ensemble models described in this post don't outperform a single random forest model, but mlens is a nice tool for ensemble learning, and there is still room to improve model performance: the choice of models, hyperparameters, etc.

The original documentation gives more information. Please follow the link if you are interested.
http://ml-ensemble.com/docs/

Quantum Chemistry data of drug bank #QCportal #Quantum_Chemistry

I'm still learning QCArchive. I previously posted about qcportal with a reaction dataset, and today I tried to retrieve DrugBank data from qcportal. QCPortal provides not only calculated numeric data but also a 3D mol view using py3Dmol.

OK, let's go to the code. The get_molecules method retrieves a wealth of data from the QCPortal web server.

import qcportal as ptl
client = ptl.FractalClient()
ds = client.get_collection("Dataset", "COMP6 DrugBank")
mols = ds.get_molecules()
mols.shape
> (13379, 1)

What kinds of data are in the dataset? It's easy to check; just call a few methods.

ds.list_values().reset_index()['method'].unique()
> array(['ωB97x', 'b3lyp', 'b3lyp-d3m(bj)', 'hf', 'pbe', 'pbe-d3m(bj)',
       'svwn', 'wb97m', 'wb97m-d3(bj)'], dtype=object)
ds.list_values().reset_index()['basis'].unique()
> array(['6-31g*', 'def2-tzvp'], dtype=object)

ds.list_values()

This dataset contains data not only from Psi4 but also from Gaussian!

I retrieved the data for method='ωB97x'.

val = ds.get_values(method='ωB97x')
val.columns
> Index(['CM5 Charges', 'Hirshfeld Charges', 'Energy', 'Gradient',
       'Hirshfeld Dipole', 'Spin Density'],
      dtype='object')

I extracted the energies from the data and visualized a molecule.

energy = val['Energy']
mols['molecule'][0].show()
energy[0]
> -636107.9519541461

Py3Dmol works very well. I could get the QC energy of a DrugBank molecule and render the molecule as a 3D object.

It is very cool!

My whole code is uploaded at the following URL.

Have a nice weekend! ;)

https://nbviewer.jupyter.org/github/iwatobipen/playground/blob/master/drug_bank.ipynb

Open data source of Quantum chemistry! #qcportal #rdkit #cheminformatics #quantum_chemistry

At the RDKit UGM 2019, I became interested in QCArchive. QCArchive is the MolSSI quantum chemistry archive. It provides useful data and Python packages.

By using one of those packages, named qcportal, we can access a huge quantum chemistry data source. This is valuable because QC calculations are useful but computationally expensive. QC data is helpful for drug design and machine learning (e.g. building machine-learning-based force fields).

I tried the package. At first I installed qcportal via conda in my env. That wasn't a good choice, because I couldn't install the newest version of the package, and the old version of qcportal causes errors. So I installed it via pip instead, and it worked fine.
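
So the installation is simply:

pip install qcportal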

The following code is almost the same as in the original documentation, but I ran it for my own memorandum. At first, import the packages and make a client object. I used the data source from MolSSI.

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import qcportal as ptl
client = ptl.FractalClient()

Then I checked the list of torsion drive datasets. Many datasets are available.

client.list_collections("TorsionDriveDataset")

>Fragment Stability Benchmark	None
>OpenFF Fragmenter Phenyl Benchmark	Phenyl substituent torsional barrier heights.
>OpenFF Full TorsionDrive Benchmark 1	None
>OpenFF Group1 Torsions	None
>OpenFF Primary TorsionDrive Benchmark 1	None
>OpenFF Substituted Phenyl Set 1	None
>Pfizer Discrepancy Torsion Dataset 1	None
>SMIRNOFF Coverage Torsion Set 1	None
>TorsionDrive Paper	None

ds = client.get_collection("TorsionDriveDataset", "OpenFF Fragmenter Phenyl Benchmark")
ds.df.head()

>c1c[cH:1][c:2](cc1)[C:3](=[O:4])O
>c1[cH:1][c:2](cnc1)[C:3](=[O:4])O
>[cH:1]1cncc[c:2]1[C:3](=[O:4])O
>[cH:1]1cc(nc[c:2]1[C:3](=[O:4])O)[O-]
>Cc1c[cH:1][c:2](cn1)[C:3](=[O:4])O

OK, I succeeded in loading the data. Let's visualize some of the completed entries. RDKit is a very useful package for drawing molecules!!!!!

complete_data = ds.status(["b3lyp-d3"], collapse=False, status="COMPLETE")
Draw.MolsToGridImage([Chem.MolFromSmiles(complete_data['B3LYP-D3'].index[i]) for i in range(10)],
                    molsPerRow=5)

Finally, visualize the torsion energies!

ds.visualize([complete_data['B3LYP-D3'].index[i] for i in range(10)],"B3LYP-D3", units="kJ / mol")

The purple line (4th structure) has the highest torsion energy at -90 and 90 degrees.
The molecule is 5-hydroxynicotinic acid. The hydroxyl group is located para to the carboxylic acid group, so the conjugation effect makes its relative energy higher than that of the other structures.

The package is useful not only as a QC data source but also for visualization and analysis of molecules.

I uploaded today’s code on my gist.