select diverse molecules from dataset.

To select subset from compound library, I think about molecular diversity.
So, I often use clustering .
There are many clustering method in chemo-informatics area.
But, today I used another approach. Use GA.
My strategy is that, evaluate diversity of molecules is sum of dissimilarity.
Let’s start.
For convenience, I used rdkit first_prop_200.sdf as dataset.

import scipy.misc as scm
import numpy as np
import random
from rdkit import Chem
from rdkit.Chem import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from deap import creator, base
from deap import algorithms
from deap import tools

mols = [mol for mol in Chem.SDMolSupplier('first_200.props.sdf')]
n_mols = len( mols )

# define diversity as sum of dissimilarity
def eval_diversity(individual):
    combination = scm.comb( len(individual),2 )
    dis_sim_mat=[]
    fps =[ AllChem.GetMorganFingerprintAsBitVect(mols[i], 2) for i in individual ]
    for idx in range(1,len(fps)):
        # return distance returnDistance=1
        dis_sim = DataStructs.BulkCosineSimilarity(fps[idx], fps[:idx], returnDistance=1)
        dis_sim_mat.extend(dis_sim)
    return sum(dis_sim_mat)/combination,

Points.
1. I want to maximise dissimilarity
2. each chromosome has ten molecules.
3. population is 20.

creator.create( 'Fitness', base.Fitness, weights=(1.0,) )
creator.create( 'Individual', list, fitness = creator.Fitness )
toolbox = base.Toolbox()
#generate random int from range(200). It means select molecules in random.
toolbox.register('attr_item', random.randint, 0,n_mols)
# select 10 molecules from library and maximize diversity.
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_item, 10)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

toolbox.register('evaluate', eval_diversity)
toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutShuffleIndexes, indpb = 0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

Then write main function.

def main():
    random.seed(64)
    #set cross over and mutation probability and generation.
    CXPB, MUTPB, NGEN= 0.5, 0.2, 40
    # use all molecures.
    pop = toolbox.population(n=20)
    print('start evaluation !!!')
    fitnesses = list( map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    print('evaled %i inds' %len(pop))
    for g in range(NGEN):
        print('--GENERATION %i--' % g)
        # select the next generation individuals
        offspring = toolbox.select( pop, len(pop) )
        # clone the selected individuals
        offspring = list( map(toolbox.clone, offspring) )
        # Apply crossover and mutaion on the offspring
        for  child1, child2 in zip( offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate( child1, child2 )
                del child1.fitness.values
                del child2.fitness.values
        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                del mutant.fitness.values
        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ ind for ind in offspring if not ind.fitness.valid ]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
        print(' evaled %i individuals '%len(invalid_ind))
        pop[:] = offspring
        fits = [ind.fitness.values[0] for ind in pop]
        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x*x for x in fits)
        std = abs(sum2/length-mean**2)**0.5
        print('min %s' % min(fits))
        print ('max %s' % max(fits))
        print ('avg %s' % mean)
        print('std %s' % std)
    print('--end of evolution--')
    best_ind = tools.selBest(pop, 1)[0]
    print('bset individual is %s, %s' % (best_ind, best_ind.fitness.values))

Ready.
Then run script.

main()
<out put from here>
start evaluation !!!
evaled 20 inds
--GENERATION 0--
 evaled 18 individuals 
min 0.765494194159
max 0.850630899091
avg 0.818827150217
std 0.0248685727781
........
 evaled 10 individuals 
min 0.912074571976
max 0.912074571976
avg 0.912074571976
std 2.58095682795e-08
--end of evolution--
bset individual is [41, 130, 1, 113, 5, 198, 94, 130, 93, 151], (0.91207457197601005,)
<end>

Now I got answer.
Check molecules.

res = [mols[i] for i in [41, 130, 1, 113, 5, 198, 94, 130, 93, 151]]
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
img = Draw.MolsToGridImage( res,molsPerRow=5,subImgSize=(200,200))
img

Ah …
Screen Shot 2015-02-08 at 10.48.01 PM

Now I got best dissimilarity score was 0.91207457197601005.
Compare with random fashion.

sample = random.sample(range(200),10)
sample
>[37, 160, 157, 30, 163, 38, 112, 118, 135, 85]
eval_diversity(sample)
>(0.82878571084365449,)

😉
Select molecules using GA is useful because, I could make evaluate function in myself.
Today I used only maximise dissimilarity, but I can also use maximise drug likeness or activity for target, or any more…
So I’ll study more deeply about deap.

GA with python

Genetic algorithm is sometime used for designing diversity compounds library set or optimising hyper parameter about QSAR etc….
I knew only one library about genetic algorithm in python ‘pyevolve’.
Some days ago, I found another library deap. Development is active.
I wrote sample to solve knapsack problem.
Deap is installed by using pip or easy_install.
Then, import module and set item-set.

from deap import base
from deap import creator
from deap import tools
import random
#GA
KNAP_SIZE       = 20

#itemdata
items = []
items.append({'id':'A','price': 800,'size':3})
items.append({'id':'B','price': 900,'size':5})
items.append({'id':'C','price':1100,'size':5})
items.append({'id':'D','price':1200,'size':7})
items.append({'id':'E','price':1800,'size':8})
max_size = sum([x['size'] for x in items])

In this problem, I want to maximise price and total weight maintain less than or equal to a given limit(KNAP_SIZE).
So, I set weights is positive value (1.0,). If, we want to minimise value, set weights negative.

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)
toolbox = base.Toolbox()
toolbox.register('attr_bool', random.randint, 0, 1)
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_bool, len(items))
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

Basic strategy is that set combination of items are represented as bit list.
For example, bit list [1, 1, 1, 0, 0] means that choice items A,B,C.
The function attr_bool is used to returns 0(not choice the item) or 1(choice the item) in random.
tools.initRepeat needs 3 argments 1. container, 2. function, 3. the number of times to repeat func.
Now I finished configuration of initial settings.
Next, define the functions for calculation price and size, and for evaluate these results.

def calc(individual):
    price = 0
    size = 0
    for x in xrange( len(individual) ):
        price += items[x]["price"] * individual[x]
        size += items[x]["size"] * individual[x]
    return (price, size)

def eval(individual):
    price, size = calc(individual)
    if KNAP_SIZE < size:
        over_score = 1.0 - float( KNAP_SIZE - size )/float(max_size)
        return max(0.0, over_score), # return tuple is important !! 
    return 1.0 + float( price ), # return tuple is important !!

Then set evaluate, cross over, mutation, selection in toolbox.
multiFlipBit is used for mutation. The function flip the value of the attributes of the input individual and return the mutant.
cxTwoPoint function is used to execute a two-point crossover on the input sequence individuals.
selTournament selects k individuals from the input individuals using k tournaments of tournsize individuals.
samples are….

toolbox.register('evaluate', eval)
toolbox.register('mate', tools.cxTwoPoints)
toolbox.register('mutate', tools.mutFlipBit, indpb=0.05)
toolbox.register('select', tools.selTournament, tournsize=3)

Finally build main function.

def main():
    random.seed(64)
    # generate first generation
    pop = toolbox.population(n=300)
    CXPB, MUTPB, NGEN = 0.5, 0.2, 40 # provability of cross over, mutation, num of generation to iterate.
    print("Start of evolution")

    # evaluate first generation. map function evaluate every individual.
    fitnesses = list(map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit

    print("  Evaluated %i individuals" % len(pop))

    # calc start
    for g in range(NGEN):
        print("-- Generation %i --" % g)

        # select individuals in each generation
        offspring = toolbox.select(pop, len(pop))
        # Clone the selected individuals
        offspring = list(map(toolbox.clone, offspring))

        # Apply crossover and mutation on the offspring
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < CXPB:
                toolbox.mate(child1, child2)
                # invalidate the fitness of the modified individuals.
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random.random() < MUTPB:
                toolbox.mutate(mutant)
                # invalidate the fitness of the modified individuals.
                del mutant.fitness.values

        # Evaluate the individuals with an invalid fitness
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit

        print("  Evaluated %i individuals" % len(invalid_ind))

        # replace the old population by the offspring
        pop[:] = offspring

        fits = [ind.fitness.values[0] for ind in pop]

        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x*x for x in fits)
        std = abs(sum2 / length - mean**2)**0.5

        print("  Min %s" % min(fits))
        print("  Max %s" % max(fits))
        print("  Avg %s" % mean)
        print("  Std %s" % std)

    print("-- End of (successful) evolution --")

    best_ind = tools.selBest(pop, 1)[0]
    print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))

Run main().
Then, I got following answer.

Start of evolution
Evaluated 300 individuals
— Generation 0 —
Evaluated 173 individuals
Min 1.03571428571
Max 4101.0
Avg 2884.67595238
Std 1144.68904215
— Generation 1 —
Evaluated 201 individuals
Min 1.0
Max 4101.0
Avg 3071.01690476
Std 1283.6911134
— Generation 2 —
Evaluated 161 individuals
Min 1.03571428571
Max 4101.0
Avg 3185.02345238
Std 1377.43529536
— Generation 3 —
Evaluated 167 individuals
Min 1.03571428571
Max 4101.0
Avg 3377.35535714
Std 1297.27580227
— Generation 4 —
Evaluated 185 individuals
Min 1.10714285714
Max 4101.0
Avg 3477.68761905
Std 1263.92637085
— Generation 5 —
Evaluated 182 individuals
Min 1.10714285714
Max 4101.0
Avg 3847.6747619
Std 878.991815169
— Generation 6 —
Evaluated 180 individuals
Min 1.10714285714
Max 4101.0
Avg 4007.33535714
Std 514.111213481
— Generation 7 —
Evaluated 193 individuals
Min 2901.0
Max 4101.0
Avg 4073.66666667
Std 176.973318767
— Generation 8 —
Evaluated 192 individuals
Min 1.10714285714
Max 4101.0
Avg 4043.00035714
Std 329.595377539
— Generation 9 —
Evaluated 178 individuals
Min 1.10714285714
Max 4101.0
Avg 4002.66797619
Std 473.237158848
— Generation 10 —
Evaluated 168 individuals
Min 1.10714285714
Max 4101.0
Avg 4007.33583333
Std 558.426232918
— Generation 11 —
Evaluated 199 individuals
Min 1.10714285714
Max 4101.0
Avg 4038.00095238
Std 406.107511159
— Generation 12 —
Evaluated 182 individuals
Min 1.10714285714
Max 4101.0
Avg 3972.66904762
Std 616.991336905
— Generation 13 —
Evaluated 160 individuals
Min 1.10714285714
Max 4101.0
Avg 4024.66857143
Std 504.702447904
— Generation 14 —
Evaluated 190 individuals
Min 1.10714285714
Max 4101.0
Avg 4027.00071429
Std 410.266070554
— Generation 15 —
Evaluated 174 individuals
Min 1.10714285714
Max 4101.0
Avg 3906.0047619
Std 786.153808249
— Generation 16 —
Evaluated 182 individuals
Min 1.10714285714
Max 4101.0
Avg 4002.3352381
Std 533.650615415
— Generation 17 —
Evaluated 160 individuals
Min 1.10714285714
Max 4101.0
Avg 4013.00261905
Std 582.438825217
— Generation 18 —
Evaluated 175 individuals
Min 1.10714285714
Max 4101.0
Avg 3977.66869048
Std 584.613370624
— Generation 19 —
Evaluated 169 individuals
Min 1.17857142857
Max 4101.0
Avg 4039.00178571
Std 442.426919261
— Generation 20 —
Evaluated 179 individuals
Min 1.17857142857
Max 4101.0
Avg 3996.66845238
Std 515.040728778
— Generation 21 —
Evaluated 175 individuals
Min 1.10714285714
Max 4101.0
Avg 3974.0025
Std 603.504599863
— Generation 22 —
Evaluated 172 individuals
Min 1.10714285714
Max 4101.0
Avg 3989.6697619
Std 565.726846127
— Generation 23 —
Evaluated 165 individuals
Min 1.10714285714
Max 4101.0
Avg 3952.00357143
Std 688.963554238
— Generation 24 —
Evaluated 171 individuals
Min 1.10714285714
Max 4101.0
Avg 3936.00440476
Std 737.319696732
— Generation 25 —
Evaluated 185 individuals
Min 1.10714285714
Max 4101.0
Avg 3990.33630952
Std 609.47721591
— Generation 26 —
Evaluated 186 individuals
Min 1.10714285714
Max 4101.0
Avg 3954.33654762
Std 667.230701031
— Generation 27 —
Evaluated 189 individuals
Min 1.10714285714
Max 4101.0
Avg 3878.33988095
Std 866.776854377
— Generation 28 —
Evaluated 180 individuals
Min 1.10714285714
Max 4101.0
Avg 4018.66892857
Std 520.163161114
— Generation 29 —
Evaluated 200 individuals
Min 1.10714285714
Max 4101.0
Avg 3973.00309524
Std 625.293060829
— Generation 30 —
Evaluated 183 individuals
Min 1.10714285714
Max 4101.0
Avg 3998.0027381
Std 562.911282639
— Generation 31 —
Evaluated 167 individuals
Min 1.10714285714
Max 4101.0
Avg 4036.00154762
Std 440.373906617
— Generation 32 —
Evaluated 168 individuals
Min 1.10714285714
Max 4101.0
Avg 4011.33559524
Std 550.431721979
— Generation 33 —
Evaluated 194 individuals
Min 1.10714285714
Max 4101.0
Avg 3982.0027381
Std 582.852639817
— Generation 34 —
Evaluated 177 individuals
Min 1.10714285714
Max 4101.0
Avg 3995.33666667
Std 600.201016792
— Generation 35 —
Evaluated 189 individuals
Min 1.10714285714
Max 4101.0
Avg 4005.66940476
Std 549.869341172
— Generation 36 —
Evaluated 185 individuals
Min 1.10714285714
Max 4101.0
Avg 4003.66857143
Std 527.99397811
— Generation 37 —
Evaluated 178 individuals
Min 2301.0
Max 4101.0
Avg 4067.33333333
Std 222.934718895
— Generation 38 —
Evaluated 168 individuals
Min 1.10714285714
Max 4101.0
Avg 4060.00035714
Std 305.967483162
— Generation 39 —
Evaluated 175 individuals
Min 1.10714285714
Max 4101.0
Avg 4007.33559524
Std 554.113494983
— End of (successful) evolution —
Best individual is [0, 0, 1, 1, 1], (4101.0,)
The answer is choice C, D, E. 😉

ref link
http://deap.gel.ulaval.ca/doc/default/examples/ga_onemax.html
http://d.hatena.ne.jp/pashango_p/20090601/1243850103