# GLARE algorithm using RDKit with python3

Now version of RDKit has many tools. And I interested in the Glare algorithm.
https://github.com/rdkit/rdkit/blob/master/Contrib/Glare/glare.py

This algorithm is used for good quality library generation from large set of reagents.
In the method, key point is pre calculation of reagent properties and sum the value for product. So, It does not need calculate product property on the fly.
Details are described following article
http://pubs.acs.org/doi/pdf/10.1021/ci0504871

I tried to use the algorithm in my PC, but it did not work. Because of my environment is python 3.5.
There are some changes form python2 to python3. And the code has specific function for python2.
So, I added some changes to original code.

First, defined cmp function because there is not used cmp function in python3.
Second, changed sort key argument from cmp to cmp_to_key( cmp ).
3rd devision was changed. In the python2 9/8 return 1, but 9/8 return 1.125 in python3. So, I changed following code.
from
#sizes = [ (libIdx, max(rg.count()/total_num_partitions_per_rgroup, 1))
# for libIdx, rg in enumerate(self.rgroups) ]
to
sizes = [ (libIdx, max(rg.count()//total_num_partitions_per_rgroup, 1))
for libIdx, rg in enumerate(self.rgroups) ]

Finally modified code is following.
glare4py3.py

```from __future__ import print_function
import random, operator, itertools, math
# in the python3, need to call reduce from function
from functools import reduce, cmp_to_key
# in python3, cmp function is not used.
# xrange is equal to range.
def cmp( a,b ):
if a == b: return 0
return -1 if a < b else 1

"""
Glare Algorithm

Some nomenclature:

A Libary is made of RGroups
RGroups are a collection of sidechains (the paper uses Fragments)
that can populate the rgroup position.

We desire to optimize the Library so that we have a good chance
of making the desired products.

Example From the testing code, using Fake data:

r1 = RGroups(makeFakeSidechains("aldehydes", num=1000))
r2 = RGroups(makeFakeSidechains("boronic_acids", num=1500))

lib = Library([r1,r2])
props = [
Property("mw",    propIdx=0, minValue=0, maxValue=500),
Property("alogp", propIdx=1, minValue=-2.4, maxValue=5),
Property("tpsa",  propIdx=2, minValue=0, maxValue=90)
]

glare = Glare()
# optimize the library...
glare.optimize(lib, props)
"""

class Property:
def __init__(self, name, propIdx, minValue, maxValue, scaffoldoffset=0.0):
"""name, propIdx, minValue, maxValue, scaffoldoffset -> initial a Property
name is the name of the property.
propIdx:  the index of the property in the property vector
minValue: the minimum acceptable value for the property
maxValue: the maximum acceptable value for the property
scaffoldoffset: any offset from the reaction scaffold (defaults to 0)
"""
self.name = name
self.propIdx = propIdx
self.minValue = minValue
self.maxValue = maxValue
self.offset = scaffoldoffset

def evaluate(self, sidechains):
"""sidechains -> Evaluate a list of sidechains to see if they
pass the property values.

Each sidechain must have a property vector e.g. (s.props for s in sidechains)
which is a vector of values where s.props[propIdx] is the property
being inspected
"""
product = self.offset
propIdx = self.propIdx
for s in sidechains:
product += s.props[propIdx]
return self.minValue <= product <= self.maxValue

class Sidechain:
"""Holds the name (identifier) and property list for the
given sidechain/fragment.  Properties are assumed to
be numerical values"""
def __init__(self, name, props, goodCount=0):
"""name, props, goodCount=0 -> initialize a Sidechain
initialize a sidechain.
name: the unique name for the sidechain
props: the property vector (see Properties class for details)
goodCount: the number of times this reagent belongs to
a good product, where good is a product that is in the desired
property space.
"""
self.name = name
self.props = props
self.good_count = goodCount # shared variable
self.dropped = False        # shared variable

def __str__(self):
return "Sidechain %s(%s, goodCount=%s)"%(self.name,
self.props, self.good_count)
def __repr__(self):
return "Sidechain(%r, %r, %s)"%(self.name, self.props, self.good_count)

class RGroups:
"""Holds a collection of sidechains for the given RGroup"""
def __init__(self, sidechains):
"""Sidechains -> RGroups
sidechains: the list of Sidechains that make up the potential
sidechains at this rgroup position"""
self.sidechains = sidechains

self.rejected = []  # list of rejected sidechains
self.initial_size = len(sidechains)

def count(self):
"""Returns the number of possible sidechains"""
return len(self.sidechains)

def randomize(self):
"""Randomly shuffles the sidechains and reset the goodness counts"""
random.shuffle(self.sidechains)
for s in self.sidechains:
s.good_count = 0

def effectiveness(self):
"""-> return the current effectiveness of this collection
effectiveness is the number of items left divided by the
initial amount"""

return len(self.sidechains)/float(self.initial_size)

def chunk_size(self, num_chunks):
"""num_chunks -> return the number of sidechains in each chunk
if the sidechains are split into num_chunks chunks"""
return int(math.ceil(float(len(self.sidechains))/num_chunks))

def chunk(self, chunk_idx, num_chunks):
"""chunk_idx, num)chunks -> RGroups
return the chunk_idxth chunk given num_chunks total chunks"""
assert chunk_idx >=0  and chunk_idx < num_chunks, "%s %s"%(
chunk_idx, num_chunks)

n = self.chunk_size(num_chunks)
return RGroups(self.sidechains[chunk_idx*n:(chunk_idx+1)*n])

def prune(self, fractionToKeep):
"""fractionToKeep -> Sort the sidechains from the most often
found if good products to the least, and keep the best
fractionToKeep percentage"""
assert 0 < fractionToKeep <= 1.0, "fractionToKeep: %s"%fractionToKeep

self.sidechains.sort(key=cmp_to_key(lambda x,y: -cmp(x.good_count, y.good_count)))
fragment_index = int(len(self.sidechains) * fractionToKeep + 0.5)

# update rejected set
self.rejected += self.sidechains[fragment_index:]
self.sidechains = self.sidechains[:fragment_index]

class Library:
"""A library is a collection of RGroups that need to be combinitorially
combined"""
def __init__(self, rgroups):
"""rgroups -> Initialize the Library.
rgroups: the list of possible RGroups that is combinitorially
combined to make the library"""
self.rgroups = rgroups

def isValid(self):
"""If we have an empty set for any rgroup, return False"""
for rg in self.rgroups:
if len(rg.sidechains) == 0:
return False
return True

def randomize(self):
"""randomize the order of the sidechains"""
for rg in self.rgroups:
rg.randomize()

def getSidechainsPerPartition( self, total_num_partitions_per_rgroup ):
"""total_num_partitions -> [num_fragments/partition for rgroup1,
num_fragments/partition for rgroup2]
return the number of sidechains in a partition
for each rgroup"""
#sizes = [ (libIdx, max(rg.count()/total_num_partitions_per_rgroup, 1))
#          for libIdx, rg in enumerate(self.rgroups) ]

sizes = [ (libIdx, max(rg.count()//total_num_partitions_per_rgroup, 1))
for libIdx, rg in enumerate(self.rgroups) ]

# "optimially" apportion the partitions according the
#  the glare paper see Appendix eq (8) and (9)
# sort by size
sizes.sort(key=cmp_to_key(lambda x,y: cmp(x[1], y[1])))
last_size = 1
opt_sizes = []
for libIdx, current_size in sizes[:-1]:
opt_sizes.append( (libIdx,
current_size - (current_size % last_size)) )
last_size = current_size

# From the Glare paper:
# the last library size is set equal to the second to last
# From Table 3, it is easy to understand that, if the fourth dimension
# was split in 24 instead of 12, a factor of 2 would be gained from the
# reduced size of the sublibraries. However, twice as many sublibraries
# would be needed, and the net speedup would be null, hence, the decision to
# set p4=p3. (p4 here is the last library)
libIdx, current_size = sizes[-1]
opt_sizes.append((libIdx, last_size))
# back to the original library order
opt_sizes.sort()
res = [size for libIdx, size in opt_sizes]
return res

def chunk(self, num_partitions):
"""num_partitions -> [Library(..), Library(...)]

Return new libraries that are chunks of this one.
These are the libraries that get sampled to see of
sidechains participate in good products.
"""
partitions = self.getSidechainsPerPartition(num_partitions)
max_subsets = max(partitions)

enumeration_indices = []
# change xragen to range because there is no xragen function in python3.
for i in range(max_subsets):
combinations = []
for size in partitions:
combinations.append( i % size )
enumeration_indices.append( combinations )

library_sets = []
for subset_index, combinations in enumerate(enumeration_indices):
libs = []
partitioned_rgroups = []
for lib_index, libpart_index in enumerate(combinations):
lib = self.rgroups[lib_index]
num_chunks = partitions[lib_index]
partitioned_rgroups.append( lib.chunk(chunk_idx=libpart_index,
num_chunks=num_chunks))
lib = Library(partitioned_rgroups)
if lib.isValid():
library_sets.append(lib)

return library_sets

def effectiveness(self):
"""-> returns the average effectiveness of this library set"""
sum = 0.0
for rg in self.rgroups:
sum += rg.effectiveness()
return sum/len(self.rgroups)

def evaluate(self, props):
"""props -> num_good_enumerations, total_enumerations

props: a list of Property evaluators for the fragments.

returns the number of good enumerations and the total number of
enumerations for this Library"""
frags = [rg.sidechains for rg in self.rgroups]
good = 0
for i,frag in enumerate(itertools.product(*frags)):
for p in props:
if not p.evaluate(frag):
break
else:
good += 1
for sidechain in frag:
sidechain.good_count += 1
return good, i+1

class Glare:
"""Glare Algorithm.  Implementation of

GLARE: A New Approach for Filtering Large Reagent Lists in
Combinatorial Library Design Using Product Properties
Jean-Francois Truchon* and Christopher I. Bayly

http://pubs.acs.org/doi/pdf/10.1021/ci0504871

Usage:
# somehow make sidechains1/2 with props [mw, alogp, tpsa]
r1 = RGroups(sidechains1)
r2 = RGroups(sidechains2)
lib = Library([r1, r2])
props = [
Property("mw", 0, 0, 500),
Property("alogp", 1, -2.4, 5),
Property("tpsa", 2, 0, 90)
]

glare = Glare()
glare.optimize(lib, props)
"""
def __init__(self,
desiredFinalGoodness=0.95,
maxIterations=100,
rgroupScale=6.0, # None if no scaling
initialFraction=None,#None=auto -100.,
numPartitions=16):
self.fractionGood = self.desiredFinalGoodness = desiredFinalGoodness
self.maxIterations = maxIterations
self.rgroupScale = rgroupScale

if initialFraction is not None:
self.initialFraction = initialFraction/100.
else:
self.initialFraction = initialFraction
self.numPartitions = numPartitions

def optimize(self, library, props):
"""library, props
Given a Library and the list of Propery evaluators,
optimize the library.
The library is modified in place by removing building blocks
(sidechains) that are not likely to pass the property
criteria.
"""
# attempt to generate report like glare application
print ("------- PARAMETERS: --------------")
print ("GOOODNESS THRESHOLD : %s%%"%(self.desiredFinalGoodness * 100))
print ("MIN PARTITION SIZE : %s"%self.numPartitions)
if self.initialFraction is None or self.initialFraction > 0.999:
print ("INITIAL FRACTION TO KEEP : AUTOMATIC")
else:
print ("INITAL FRACTION TO KEEP : %s%%"%(self.initialFraction*100))
print ("Actual SIZE : %s = %s"%(
" x ".join([str(len(rg.sidechains)) for rg in library.rgroups]),
reduce(operator.mul, [len(rg.sidechains) for rg in library.rgroups])
))

running_total = 0.0
Gt = self.desiredFinalGoodness

for iteration in range(1, self.maxIterations+1):
# chunk of the total library into smaller more managable sets
#  and run combinitorial analysis on the sub libraries
#  each of these records the number of times a sidechain is used
#  in a successful enumeration which is then used to prune the
#  library at the end
#
for rg in library.rgroups:
rg.randomize()

good = total = 0.0
chunked_libs =  library.chunk(self.numPartitions)
# for each chunk, do the combinitorial check to see
#  if reagents make good products
for libidx, chunk in enumerate(chunked_libs):
g,t = chunk.evaluate(props)
good += g
total += t
running_total += total
Gi = good/total # current goodness

if Gi < 1e-12:
# I think we're done here :)
fraction = 0.0
elif iteration == 1:
G0 = Gi # Goodness at first iteration

# the first time, use the initalFraction or a "good enough"
#  value
if self.initialFraction is not None:
fraction = K0 = self.initialFraction
else:
# auto choose the fraction based on the current good percentage
#  and the desired
fraction = K0 = min(-1.1 * ( Gt - G0) + 1.2,
0.9)
else:
# the second time, gradually eliminate reagents slowing
#  down as the number of iterations increases
#  see equation (5) in reference
if abs(Gt-G0) < 1e-4:
Ki = 1.0
else:
Ki = (1.0 - K0) * (Gi - G0) / (Gt - G0) + K0;
fraction = min(1.0, Ki)

# prune the library to keep the highest occuring sidechains
#  note that even if all sidechains are acceptable,
#  some will always get pruned

max_size = float(max([len(rg.sidechains) for rg in library.rgroups]))
for rg in library.rgroups:
scale = 1.0
if self.rgroupScale is not None:
# scale differently size rgroups via equation (6)  in paper
numSidechains = len(rg.sidechains)
numer = 1.0
denom = 1.0 + math.exp(-self.rgroupScale *
((numSidechains/max_size) - 0.5))
scale = numer/denom
fraction_to_reject = (1.0 - fraction) * scale
# keep the best fraction...
rg.prune(1.0 - fraction_to_reject)

print ("-------------- ITERATION : %s ----------------------"%iteration)
print ("GOODNESS      : %s%%"%(Gi * 100))
print ("NUMBER EVAL   : %s"%(total))
print ("CUMUL EVAL    : %s"%(running_total))
print ("KEPT IN STEP  : %s%%"%(fraction*100.))
if not iteration:
print ("GOODNESS THRESHOLD : %s"%self.desiredFinalGoodness)
print ("MIN PARTITION SIZE : %s"%self.numPartitions)
print ("INITIAL FRACTION TO KEEP : ")
if self.fractionToKeep > 0.999:
print ("AUTOMATIC")
else:
print ("%s%%"%self.fractionToKeep)

print ("Actual SIZE : %s = %s"%(
" x ".join([str(len(rg.sidechains)) for rg in library.rgroups]),
reduce(operator.mul, [len(rg.sidechains) for rg in library.rgroups])
))
print ("EFFECTIVENESS : %s%%"%(library.effectiveness()*100.))

# stopping critieria
if iteration and Gi < 1e-12:
return
elif abs(Gi - self.desiredFinalGoodness) < 0.001 or \
Gi > self.desiredFinalGoodness:
return

######################################################################
# testing codes
def makeFakeProps():
mw = random.randint(10,500)
alogp = random.randint(-10,10)
tpsa = random.randint(0,180)
return [mw, alogp, tpsa]

def makeFakeSidechains(lib, num):
res = []
for i in range(num):
res.append(Sidechain(lib + "_" + str(i), makeFakeProps()))
return res

def testGlare():
a = RGroups(makeFakeSidechains("aldehydes", 1000))
b = RGroups(makeFakeSidechains("boronic_acids", 1500))

lib = Library([a,b])
props = [
Property("mw", 0, 0, 500, 230.1419),
Property("alogp", 1, -2.4, 5, 2.212749),
Property("tpsa", 2, 0, 90, 24.5)
]

glare = Glare()
glare.optimize(lib, props)

if __name__ == "__main__":
testGlare()
```

OK Let’s test.

```# call library and run the test code.

import glare4py3 as glare
glare.testGlare()

------- PARAMETERS: --------------
GOOODNESS THRESHOLD : 95.0%
MIN PARTITION SIZE : 16
INITIAL FRACTION TO KEEP : AUTOMATIC
Actual SIZE : 1000 x 1500 = 1500000
-------------- ITERATION : 1 ----------------------
GOODNESS      : 0.292%
NUMBER EVAL   : 25000.0
CUMUL EVAL    : 25000.0
KEPT IN STEP  : 15.821200000000001%
Actual SIZE : 385 x 297 = 114345
EFFECTIVENESS : 29.15%
-------------- ITERATION : 2 ----------------------
GOODNESS      : 0.9463233012721067%
NUMBER EVAL   : 6446.0
CUMUL EVAL    : 31446.0
KEPT IN STEP  : 16.40277864502602%
Actual SIZE : 78 x 89 = 6942
EFFECTIVENESS : 6.866666666666667%
-------------- ITERATION : 3 ----------------------
GOODNESS      : 7.183908045977011%
NUMBER EVAL   : 1740.0
CUMUL EVAL    : 33186.0
KEPT IN STEP  : 21.94689739642575%
Actual SIZE : 23 x 23 = 529
EFFECTIVENESS : 1.9166666666666665%
-------------- ITERATION : 4 ----------------------
GOODNESS      : 48.39319470699433%
NUMBER EVAL   : 529.0
.....
```

Works fine!
I pushed the code to my repo.
https://github.com/iwatobipen/sandbox/tree/master/sandbox

I will try to calculate my reagent set. ;-)