select diverse molecules from dataset.

To select subset from compound library, I think about molecular diversity.
So, I often use clustering .
There are many clustering method in chemo-informatics area.
But, today I used another approach. Use GA.
My strategy is that, evaluate diversity of molecules is sum of dissimilarity.
Let’s start.
For convenience, I used rdkit first_prop_200.sdf as dataset.

import scipy.misc as scm
import numpy as np
import random
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from deap import creator, base
from deap import algorithms
from deap import tools

mols = [mol for mol in Chem.SDMolSupplier('first_200.props.sdf')]
n_mols = len( mols )

# define diversity as sum of dissimilarity
def eval_diversity(individual):
    combination = scm.comb( len(individual),2 )
    fps =[ AllChem.GetMorganFingerprintAsBitVect(mols[i], 2) for i in individual ]
    for idx in range(1,len(fps)):
        # return distance returnDistance=1
        dis_sim = DataStructs.BulkCosineSimilarity(fps[idx], fps[:idx], returnDistance=1)
    return sum(dis_sim_mat)/combination,

1. I want to maximise dissimilarity
2. each chromosome has ten molecules.
3. population is 20.

creator.create( 'Fitness', base.Fitness, weights=(1.0,) )
creator.create( 'Individual', list, fitness = creator.Fitness )
toolbox = base.Toolbox()
#generate random int from range(200). It means select molecules in random.
toolbox.register('attr_item', random.randint, 0,n_mols)
# select 10 molecules from library and maximize diversity.
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_item, 10)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

toolbox.register('evaluate', eval_diversity)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutShuffleIndexes, indpb = 0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

Then write main function.

def main():
    #set cross over and mutation probability and generation.
    CXPB, MUTPB, NGEN= 0.5, 0.2, 40
    # use all molecures.
    pop = toolbox.population(n=20)
    print('start evaluation !!!')
    fitnesses = list( map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses): = fit
    print('evaled %i inds' %len(pop))
    for g in range(NGEN):
        print('--GENERATION %i--' % g)
        # select the next generation individuals
        offspring = pop, len(pop) )
        # clone the selected individuals
        offspring = list( map(toolbox.clone, offspring) )
        # Apply crossover and mutaion on the offspring
        for  child1, child2 in zip( offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate( child1, child2 )
        for mutant in offspring:
            if random.random() < MUTPB:
        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ ind for ind in offspring if not ]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
   = fit
        print(' evaled %i individuals '%len(invalid_ind))
        pop[:] = offspring
        fits = [[0] for ind in pop]
        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x*x for x in fits)
        std = abs(sum2/length-mean**2)**0.5
        print('min %s' % min(fits))
        print ('max %s' % max(fits))
        print ('avg %s' % mean)
        print('std %s' % std)
    print('--end of evolution--')
    best_ind = tools.selBest(pop, 1)[0]
    print('bset individual is %s, %s' % (best_ind,

Then run script.

<out put from here>
start evaluation !!!
evaled 20 inds
 evaled 18 individuals 
min 0.765494194159
max 0.850630899091
avg 0.818827150217
std 0.0248685727781
 evaled 10 individuals 
min 0.912074571976
max 0.912074571976
avg 0.912074571976
std 2.58095682795e-08
--end of evolution--
bset individual is [41, 130, 1, 113, 5, 198, 94, 130, 93, 151], (0.91207457197601005,)

Now I got answer.
Check molecules.

res = [mols[i] for i in [41, 130, 1, 113, 5, 198, 94, 130, 93, 151]]
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
img = Draw.MolsToGridImage( res,molsPerRow=5,subImgSize=(200,200))

Ah …
Screen Shot 2015-02-08 at 10.48.01 PM

Now I got best dissimilarity score was 0.91207457197601005.
Compare with random fashion.

sample = random.sample(range(200),10)
>[37, 160, 157, 30, 163, 38, 112, 118, 135, 85]

Select molecules using GA is useful because, I could make evaluate function in myself.
Today I used only maximise dissimilarity, but I can also use maximise drug likeness or activity for target, or any more…
So I’ll study more deeply about deap.


Published by iwatobipen

I'm medicinal chemist in mid size of pharmaceutical company. I love chemoinfo, cording, organic synthesis, my family.

Leave a Reply

Fill in your details below or click an icon to log in: Logo

You are commenting using your account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

This site uses Akismet to reduce spam. Learn how your comment data is processed.

%d bloggers like this: