CHEMBL_MMP

またしてもMMPねた
RDKit のMMPモジュールを使ってCHEMBLのデータを全部くっつけるバージョン。
ただし今回はkinase_sarfariのみ。
準備として
https://www.ebi.ac.uk/chembl/sarfari/kinasesarfari
ここからデータを持ってきます。
ks_compound.txt
ks_bioactivity.txt
次に、作業ディレクトリに rfrag.py、index.py と上記の2つのデータファイルを入れます。
データをくっつけるスクリプトは、
chembl_mmp.py として以下のように書きます。

import os
import subprocess
import sys

import numpy as np
import pandas as pd

# Export the SMILES / compound-ID pairs for the RDKit MMPA scripts.
data = pd.read_table("ks_compound.txt", sep="\t")
data = pd.DataFrame(data=data, columns=["SMILES", "COMPOUND_ID"])
# `columns=` is the current name of the keyword formerly spelled `cols=`,
# which newer pandas releases no longer accept.
data.to_csv("chembl.smi", sep="\t", index=False,
            columns=["SMILES", "COMPOUND_ID"])
# The fragmentation step further down reads "chembl_nohead.smi", so also
# write a header-less copy; otherwise that file never exists and the
# "SMILES<TAB>COMPOUND_ID" header row would have to be stripped by hand.
data.to_csv("chembl_nohead.smi", sep="\t", index=False, header=False,
            columns=["SMILES", "COMPOUND_ID"])

# Load the bioactivity table and clean it: keep only rows that are IC50
# measurements, have ASSAY_TYPE "B", and carry an exact ("=") relation.
data = pd.read_table("ks_bioactivity.txt", sep="\t")
data = data[
    (data.ACTIVITY_TYPE == "IC50")
    & (data.ASSAY_TYPE == "B")
    & (data.RELATION == "=")
]
# Narrow the table to the columns used by the rest of the script.
data = pd.DataFrame(
    data=data,
    columns=[
        "ACTIVITY_ID",
        "NAME",
        "COMPOUND_ID",
        "ACTIVITY_TYPE",
        "STANDARD_VALUE",
        "STANDARD_UNIT",
    ],
)
# Optional debug dump of the cleaned table (disabled):
#data.to_csv( "datadata.txt" , index=False)

#########################################
# Collect the distinct target names.
#########################################
# Series.unique() returns the values in order of first appearance, like the
# hand-written loop did, but without the quadratic "is it already in the
# list?" scan for every row.
target_set = data.NAME.unique().tolist()

# Parenthesised so the line is valid on both Python 2 and Python 3.
print(len(target_set))

##########################################
# Build a dictionary: target name -> activity rows for that target.
##########################################
splitdata = {
    target: data[data.NAME == target]
    for target in target_set
}

#########################################
# Build the matched-molecular-pair (MMP) table.
#########################################
# Run the two helper scripts with explicit file handles instead of a shell
# command line. check_call raises CalledProcessError on a non-zero exit
# status, so a failed step stops the script here instead of leaving an empty
# or truncated file to be read below. A missing input file raises from open().
with open("chembl_nohead.smi") as smiles_in, \
        open("chembl_frag.txt", "w") as frag_out:
    subprocess.check_call(["python", "rfrag.py"],
                          stdin=smiles_in, stdout=frag_out)

with open("chembl_frag.txt") as frag_in, \
        open("chembl_mmpa.txt", "w") as mmpa_out:
    subprocess.check_call(["python", "index.py"],
                          stdin=frag_in, stdout=mmpa_out)

# "chembl_mmpa.txt" now holds the pairs; it has no header row, so the column
# names are supplied here.
mmpa_data = pd.read_table(
    "chembl_mmpa.txt",
    sep=",",
    header=None,
    names=["SMILES_L", "SMILES_R", "ID_L", "ID_R", "TRANSFORM", "CONTEXT"],
)

# For every target, build a pair table in which both compounds of each pair
# carry that target's activity data.
_ACTIVITY_COLUMNS = (
    "ACTIVITY_ID",
    "NAME",
    "COMPOUND_ID",
    "ACTIVITY_TYPE",
    "STANDARD_VALUE",
    "STANDARD_UNIT",
)


def _attach_activity(pairs, activity, id_column, suffix):
    # Inner-join `activity` onto `pairs` via `id_column`, then tag the
    # activity columns with `suffix` so a second join does not collide.
    joined = pairs.merge(activity, left_on=id_column,
                         right_on="COMPOUND_ID", how="inner")
    renames = dict((name, name + suffix) for name in _ACTIVITY_COLUMNS)
    return joined.rename(columns=renames)


mmpa_act_dict = {}

for target_name, target_activity in splitdata.items():
    # Left-hand compound first, then the right-hand compound; pairs where
    # either side lacks activity for this target drop out of the inner joins.
    left_joined = _attach_activity(mmpa_data, target_activity, "ID_L", "_L")
    mmpa_act_dict[target_name] = _attach_activity(
        left_joined, target_activity, "ID_R", "_R")


# Stack the per-target pair tables into one frame and write it out as CSV
# without the index column.
final_data = pd.concat(list(mmpa_act_dict.values()))
final_data.to_csv("result.txt", index=False)

こんな感じです。
これでターゲットごとのMMPが result.txt に出力されます。

Advertisement

Published by iwatobipen

I'm a medicinal chemist at a mid-size pharmaceutical company. I love chemoinformatics, coding, organic synthesis, and my family.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s

This site uses Akismet to reduce spam. Learn how your comment data is processed.

%d bloggers like this: