Source code for microberx.MetabolitePredictor

"""
This is a module that provides the main classes to predicted metabolites using MicrobeRX.

The module contains the following classes:

- MetabolitePredictor:  A class for predicting metabolites using reaction rules.

- RunPredictionRule: A class for predicting metabolites based on a single reaction rule.
"""

__all__ = ["MetabolitePredictor", "RunPredictionRule"]

from .DataFiles import load_evidences, load_reaction_rules

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, DataStructs

import itertools, copy
import pandas as pd

import datamol as dm

from tqdm.notebook import tqdm

rdkit.RDLogger.DisableLog("rdApp.*")


[docs]class MetabolitePredictor: """ A class for predicting metabolites using reaction rules. Parameters ---------- rules_table : str The path to a table containing reaction rules and associated information. Attributes ---------- predicted_metabolites : pd.DataFrame A DataFrame to store predicted metabolites and associated information. query : Chem.Mol The query molecule for metabolite prediction. query_name : str The name associated with the query molecule. query_atoms_num : int The number of heavy atoms in the query molecule. reacting_atoms : list A list to store reacting atom indices. reacting_atoms_in_unique_metabolites : list A list to store reacting atom indices in unique metabolites. Methods ------- run_prediction(query: Chem.Mol, name: str = "metabolite") Run the metabolite prediction using the provided query molecule and name. """ def __init__(self, query: Chem.Mol, query_name: str = "metabolite", cut_off:float=0.6,biosystem:str="all"): """ Initialize a MetabolitePredictor instance. Parameters ---------- rules_table : pd.DataFrame The path to a table containing reaction rules and associated information. """
[docs] self.biosystem = biosystem
[docs] self.predicted_metabolites = pd.DataFrame()
[docs] self.query = query
[docs] self.query_name = query_name
[docs] self.query_atoms_num = query.GetNumHeavyAtoms()
[docs] self.reacting_atoms = []
[docs] self.reacting_atoms_in_unique_metabolites = []
[docs] self.cut_off=cut_off
evidences = load_evidences() rules_table = load_reaction_rules() rules_table['bigg_reaction']=[r.replace("_LR","").replace("_RL","") for r in rules_table.reaction_id] if self.biosystem == "all": self.evidences = evidences.groupby('reaction_id')[['origin','compartment','name','subsystem','reversibility','ec','metanetx_reaction','seed_reaction','rhea','kegg_reaction','pubmed']].agg(lambda x: ';'.join([i for i in set(x) if isinstance(i,str)])).reset_index() self.rules_table = rules_table[rules_table["bigg_reaction"].isin(self.evidences.reaction_id.unique())] if self.biosystem == "human": evidences = evidences[['reaction_id','origin','compartment','name','subsystem','reversibility','ec','metanetx_reaction','seed_reaction','rhea','kegg_reaction','pubmed']] self.evidences = evidences[evidences.origin=="Human"] self.rules_table = rules_table[rules_table["bigg_reaction"].isin(self.evidences.reaction_id.unique())] if self.biosystem == "gutmicrobes": evidences = evidences[['reaction_id','origin','compartment','name','subsystem','reversibility','ec','metanetx_reaction','seed_reaction','rhea','kegg_reaction','pubmed']] self.evidences = evidences[evidences.origin=="GutMicrobes"] self.rules_table = rules_table[rules_table["bigg_reaction"].isin(self.evidences.reaction_id.unique())]
[docs] def run_prediction(self): """ This method performs metabolite prediction using reaction rules from the rules table. It calculates confidence scores for predicted metabolites and stores the results in the 'predicted_metabolites' attribute. Parameters ---------- query : Chem.Mol The query molecule for metabolite prediction. query_name : str, optional The name associated with the query molecule. Default is "metabolite". Returns ------- None Example ------- >>> predictor = MetabolitePredictor(rules_table) >>> query_molecule = Chem.MolFromSmiles("CC(=O)O") >>> predictor.run_prediction(query_molecule, "acetate") >>> predicted_metabolites_df = predictor.predicted_metabolites """ for index in tqdm(self.rules_table.index): # try: Predictor = RunPredictionRule( query=self.query, rule_smarts=self.rules_table.rule[index], real_product=self.rules_table.product_map[index], real_substrate=self.rules_table.substrate_map[index], ) Predictor.predict() results = Predictor.unique_products if results: tmp_table = pd.DataFrame(results.values()) tmp_table["reaction_id"] = self.rules_table["reaction_id"][index] tmp_table["substrate"] = self.rules_table.substrate[index] tmp_table["product"] = self.rules_table["product"][index] tmp_table["num_atoms"] = self.rules_table.num_atoms[index] tmp_table["bigg_reaction"] = self.rules_table["bigg_reaction"][index] tmp_table["reacting_atoms_efficiency"] = round( self.rules_table.num_atoms[index] / self.query_atoms_num, 3 ) self.predicted_metabolites = pd.concat( [self.predicted_metabolites, tmp_table], ignore_index=True ) # except Exception: # pass self.predicted_metabolites["confidence_score"] = round(self.predicted_metabolites.similarity_substrates + self.predicted_metabolites.similarity_products + self.predicted_metabolites.reacting_atoms_efficiency,3,) self.predicted_metabolites.sort_values(by="confidence_score", ascending=False, ignore_index=True, inplace=True) self.predicted_metabolites.drop_duplicates(subset=["reaction_id", "substrate", "product", "main_product_smiles"], keep="first", inplace=True, ignore_index=True) self.predicted_metabolites = self.predicted_metabolites[self.predicted_metabolites.confidence_score>=self.cut_off] un_mets = self.predicted_metabolites.drop_duplicates( subset=["main_product_smiles"], keep="first", ignore_index=True) self.unique_metabolites = copy.deepcopy(un_mets) metabolite_rank = {self.unique_metabolites.main_product_smiles[i]: f"{self.query_name}_{i+1}" for i in self.unique_metabolites.index} self.unique_metabolites["metabolite_id"] = self.unique_metabolites.main_product_smiles.map(metabolite_rank) self.predicted_metabolites["metabolite_id"] = self.predicted_metabolites.main_product_smiles.map(metabolite_rank) self.predicted_metabolites=self.predicted_metabolites.merge(self.evidences,left_on='bigg_reaction',right_on='reaction_id',how='left') self.predicted_metabolites.drop(columns=["reaction_id_y"],inplace=True) self.predicted_metabolites.rename(columns={"reaction_id_x":"reaction_id"},inplace=True)
[docs]class RunPredictionRule: """ A class for predicting reactions based on reaction rules. Parameters ---------- query : Chem.Mol The query molecule for prediction. rule_smarts : str The reaction rule in SMARTS format. real_product : str The SMILES representation of the real product. real_substrate : str The SMILES representation of the real substrate. Attributes ---------- query : Chem.Mol The query molecule for prediction. rule_smarts : str The reaction rule in SMARTS format. reaction : AllChem.Reaction The reaction object created from the reaction rule. real_product : Chem.Mol The real product molecule. real_substrate : Chem.Mol The real substrate molecule. unique_products : dict A dictionary to store information about unique predicted products. Methods ------- predict() Predict reaction products and populate the 'unique_products' attribute. """ def __init__(self, query: Chem.Mol, rule_smarts: str, real_product: str, real_substrate: str): """ Initialize a RunPredictionRule instance. Parameters ---------- query : Chem.Mol The query molecule for prediction. rule_smarts : str The reaction rule in SMARTS format. real_product : str The SMILES representation of the real product. real_substrate : str The SMILES representation of the real substrate. """
[docs] self.query = query
[docs] self.rule_smarts = rule_smarts
[docs] self.reaction = AllChem.ReactionFromSmarts(self.rule_smarts, useSmiles=False)
self.reaction.Initialize()
[docs] self.real_product = Chem.MolFromSmiles(real_product)
[docs] self.real_substrate = Chem.MolFromSmiles(real_substrate)
[docs] def predict(self): """ This method predicts reaction products based on the provided query molecule, reaction rule, real product, and real substrate. It calculates various properties and stores the results in the 'unique_products' dictionary. Returns ------- None """ self._main_reactant = self.reaction.GetReactantTemplate(0) num_substructure_matches = len( self.query.GetSubstructMatches(self._main_reactant) ) self.unique_products = {} if num_substructure_matches > 0: self._atom_map = [ a.GetAtomMapNum() for a in self._main_reactant.GetAtoms() if a.GetAtomMapNum() != 0 ] results = self.reaction.RunReactants([self.query], maxProducts=10) for products in results: try: main_product = products[0] fixed_mol = dm.fix_mol(main_product) fixed_mol = dm.sanitize_mol(fixed_mol) fixed_mol = dm.standardize_mol(fixed_mol) smi = Chem.MolToSmiles( fixed_mol, isomericSmiles=False, canonical=True ) ( main_product_atom_indexes, query_atoms_indexes, ) = self.__get_product_atom_indexes(main_product) real_product_atom_indexes = [ a.GetIdx() for a in self.real_product.GetAtoms() if a.GetAtomMapNum() in self._atom_map ] real_substrate_atom_indexes = [ a.GetIdx() for a in self.real_substrate.GetAtoms() if a.GetAtomMapNum() in self._atom_map ] query_substructure = self.__get_mol_substructure( self.query, query_atoms_indexes ) real_substrate_substructure = self.__get_mol_substructure( self.real_substrate, real_substrate_atom_indexes ) main_product_substructure = self.__get_mol_substructure( main_product, main_product_atom_indexes ) real_product_substructure = self.__get_mol_substructure( self.real_product, real_product_atom_indexes ) self.unique_products[smi] = { "main_product_smiles": smi, "secondary_products_smiles": ".".join( [Chem.MolToSmiles(p, canonical=True) for p in products[1:]] ), "similarity_substrates": self.__compute_similarity( self.query, self.real_substrate, fingerPrintModel="rdkit" ), #'similarity_substrates_substructure':self.__compute_similarity(query_substructure,real_substrate_substructure,fingerPrintModel='rdkit'), "similarity_products": self.__compute_similarity( main_product, self.real_product, fingerPrintModel="rdkit" ), #'similarity_products_substructure':self.__compute_similarity(main_product_substructure,real_product_substructure,fingerPrintModel='rdkit'), "reacting_atoms_in_query": query_atoms_indexes, } except Exception as e: pass
[docs] def __get_product_atom_indexes(self, product: Chem.Mol): product_atoms_indexes = [] query_atoms_indexes = [] for atom in product.GetAtoms(): atom_properties = atom.GetPropsAsDict() if set(["old_mapno", "react_atom_idx"]).issubset( set(atom_properties.keys()) ): product_atoms_indexes.append(atom.GetIdx()) query_atoms_indexes.append(int(atom.GetProp("react_atom_idx"))) return product_atoms_indexes, query_atoms_indexes
[docs] def __get_mol_substructure(self, mol_target: Chem.Mol, atom_indexes_to_keep): """ Get the reaction substructure from the input molecule. :return: The reaction substructure """ atom_indexes_to_keep = self.__get_substructure_neighbors( mol_target, atom_indexes_to_keep ) atom_indexes_to_remove = [ atom.GetIdx() for atom in mol_target.GetAtoms() if atom.GetIdx() not in atom_indexes_to_keep ] substructure = self.__trim_mol(mol_target, atom_indexes_to_remove) return Chem.MolFromSmiles(Chem.MolToSmiles(substructure))
[docs] def __get_substructure_neighbors(self, mol_target: Chem.Mol, atom_indexes_to_keep): """ Get the substructure neighbors of the input molecule. :return: The substructure neighbors """ neighbor_atoms = [] rings = mol_target.GetRingInfo() for atom_index in atom_indexes_to_keep: atom = mol_target.GetAtomWithIdx(atom_index) neighbor_atoms.extend( [neighbor.GetIdx() for neighbor in atom.GetNeighbors()] ) neighbor_atoms.extend( [ neighbor_neighbor.GetIdx() for neighbor in atom.GetNeighbors() for neighbor_neighbor in neighbor.GetNeighbors() ] ) neighbor_atoms.extend( [a for ring in rings.AtomRings() for a in ring if atom_index in ring] ) unique_atom_pairs = list(itertools.combinations(set(neighbor_atoms), r=2)) atom_path = [] for i, j in unique_atom_pairs: atom_path += Chem.GetShortestPath(mol_target, i, j) return list(set(atom_path))
[docs] def __trim_mol(self, mol_target: Chem.Mol, atom_indexes_to_remove): """ Trim the input molecule to get the desired substructure. :param atoms_to_remove: The atoms to remove from the input molecule :return: The trimmed molecule """ mol_target = copy.deepcopy(mol_target) Chem.SanitizeMol(mol_target) Chem.Kekulize(mol_target, clearAromaticFlags=True) editable_mol = Chem.EditableMol(mol_target) for atom_id in sorted(atom_indexes_to_remove, reverse=True): editable_mol.RemoveAtom(atom_id) return editable_mol.GetMol()
[docs] def __compute_similarity( self, mol1: Chem.Mol, mol2: Chem.Mol, fingerPrintModel: str = "maccskey" ): if fingerPrintModel == "rdkit": similarity = DataStructs.FingerprintSimilarity( Chem.RDKFingerprint(mol1), Chem.RDKFingerprint(mol2) ) if fingerPrintModel == "maccskey": similarity = DataStructs.FingerprintSimilarity( MACCSkeys.GenMACCSKeys(mol1), MACCSkeys.GenMACCSKeys(mol2) ) return round(similarity, 3)