Source code for microberx.MetabolitePredictor

"""
This is a module that provides the main classes to predicted metabolites using MicrobeRX.

The module contains the following classes:

- MetabolitePredictor:  A class for predicting metabolites using reaction rules.

- RunPredictionRule: A class for predicting metabolites based on a single reaction rule.
"""

__all__ = ["MetabolitePredictor", "RunPredictionRule"]

from .DataFiles import load_evidences, load_reaction_rules

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, DataStructs

import itertools, copy
import pandas as pd

import datamol as dm

from tqdm.notebook import tqdm

rdkit.RDLogger.DisableLog("rdApp.*")


[docs]class MetabolitePredictor:
    """
    A class for predicting metabolites using reaction rules.

    Parameters
    ----------
    rules_table : str
        The path to a table containing reaction rules and associated information.

    Attributes
    ----------
    predicted_metabolites : pd.DataFrame
        A DataFrame to store predicted metabolites and associated information.
    query : Chem.Mol
        The query molecule for metabolite prediction.
    query_name : str
        The name associated with the query molecule.
    query_atoms_num : int
        The number of heavy atoms in the query molecule.
    reacting_atoms : list
        A list to store reacting atom indices.
    reacting_atoms_in_unique_metabolites : list
        A list to store reacting atom indices in unique metabolites.

    Methods
    -------
    run_prediction(query: Chem.Mol, name: str = "metabolite")
        Run the metabolite prediction using the provided query molecule and name.
    """
    
    def __init__(self, query: Chem.Mol, query_name: str = "metabolite", cut_off:float=0.6,biosystem:str="all"):
        """
        Initialize a MetabolitePredictor instance.

        Parameters
        ----------
        rules_table : pd.DataFrame
            The path to a table containing reaction rules and associated information.
        """
        
[docs]        self.biosystem = biosystem
[docs]        self.predicted_metabolites = pd.DataFrame()
[docs]        self.query = query
[docs]        self.query_name = query_name
[docs]        self.query_atoms_num = query.GetNumHeavyAtoms()
[docs]        self.reacting_atoms = []
[docs]        self.reacting_atoms_in_unique_metabolites = []
[docs]        self.cut_off=cut_off
        
        evidences = load_evidences()
        rules_table = load_reaction_rules()
        
        rules_table['bigg_reaction']=[r.replace("_LR","").replace("_RL","") for r in rules_table.reaction_id]
        
        
        if self.biosystem == "all":
            self.evidences = evidences.groupby('reaction_id')[['origin','compartment','name','subsystem','reversibility','ec','metanetx_reaction','seed_reaction','rhea','kegg_reaction','pubmed']].agg(lambda x: ';'.join([i for i in set(x) if isinstance(i,str)])).reset_index()
            
            self.rules_table = rules_table[rules_table["bigg_reaction"].isin(self.evidences.reaction_id.unique())]
            
        if self.biosystem == "human":
            evidences = evidences[['reaction_id','origin','compartment','name','subsystem','reversibility','ec','metanetx_reaction','seed_reaction','rhea','kegg_reaction','pubmed']]
            self.evidences = evidences[evidences.origin=="Human"]
            self.rules_table = rules_table[rules_table["bigg_reaction"].isin(self.evidences.reaction_id.unique())]
        
        if self.biosystem == "gutmicrobes":
            evidences = evidences[['reaction_id','origin','compartment','name','subsystem','reversibility','ec','metanetx_reaction','seed_reaction','rhea','kegg_reaction','pubmed']]
            self.evidences = evidences[evidences.origin=="GutMicrobes"]
            self.rules_table = rules_table[rules_table["bigg_reaction"].isin(self.evidences.reaction_id.unique())]

[docs]    def run_prediction(self):
        """
        This method performs metabolite prediction using reaction rules from the rules table. It calculates confidence scores for predicted metabolites and stores the results in the 'predicted_metabolites' attribute.

        Parameters
        ----------
        query : Chem.Mol
            The query molecule for metabolite prediction.
        query_name : str, optional
            The name associated with the query molecule. Default is "metabolite".

        Returns
        -------
        None
        
        Example
        -------
        >>> predictor = MetabolitePredictor(rules_table)
        >>> query_molecule = Chem.MolFromSmiles("CC(=O)O")
        >>> predictor.run_prediction(query_molecule, "acetate")
        >>> predicted_metabolites_df = predictor.predicted_metabolites
        """

        for index in tqdm(self.rules_table.index):
            # try:
            Predictor = RunPredictionRule(
                query=self.query,
                rule_smarts=self.rules_table.rule[index],
                real_product=self.rules_table.product_map[index],
                real_substrate=self.rules_table.substrate_map[index],
            )
            Predictor.predict()
            results = Predictor.unique_products
            if results:
                tmp_table = pd.DataFrame(results.values())
                tmp_table["reaction_id"] = self.rules_table["reaction_id"][index]
                tmp_table["substrate"] = self.rules_table.substrate[index]
                tmp_table["product"] = self.rules_table["product"][index]
                tmp_table["num_atoms"] = self.rules_table.num_atoms[index]
                tmp_table["bigg_reaction"] = self.rules_table["bigg_reaction"][index]
                tmp_table["reacting_atoms_efficiency"] = round(
                    self.rules_table.num_atoms[index] / self.query_atoms_num, 3
                )
                self.predicted_metabolites = pd.concat(
                    [self.predicted_metabolites, tmp_table], ignore_index=True
                )
        # except Exception:
        # pass

        self.predicted_metabolites["confidence_score"] = round(self.predicted_metabolites.similarity_substrates
                                                               + self.predicted_metabolites.similarity_products
                                                               + self.predicted_metabolites.reacting_atoms_efficiency,3,)
        
        self.predicted_metabolites.sort_values(by="confidence_score", ascending=False, ignore_index=True, inplace=True)
        
        
        
        self.predicted_metabolites.drop_duplicates(subset=["reaction_id", "substrate", "product", "main_product_smiles"],
                                                   keep="first",
                                                   inplace=True,
                                                   ignore_index=True)

        self.predicted_metabolites = self.predicted_metabolites[self.predicted_metabolites.confidence_score>=self.cut_off]
        
        un_mets = self.predicted_metabolites.drop_duplicates(
            subset=["main_product_smiles"], keep="first", ignore_index=True)
        
        self.unique_metabolites = copy.deepcopy(un_mets)
        
        metabolite_rank = {self.unique_metabolites.main_product_smiles[i]: f"{self.query_name}_{i+1}" for i in self.unique_metabolites.index}

        self.unique_metabolites["metabolite_id"] = self.unique_metabolites.main_product_smiles.map(metabolite_rank)
        
        self.predicted_metabolites["metabolite_id"] = self.predicted_metabolites.main_product_smiles.map(metabolite_rank)
        
        self.predicted_metabolites=self.predicted_metabolites.merge(self.evidences,left_on='bigg_reaction',right_on='reaction_id',how='left')
        
        self.predicted_metabolites.drop(columns=["reaction_id_y"],inplace=True)
        self.predicted_metabolites.rename(columns={"reaction_id_x":"reaction_id"},inplace=True)


[docs]class RunPredictionRule:
    """
    A class for predicting reactions based on reaction rules.

    Parameters
    ----------
    query : Chem.Mol
        The query molecule for prediction.
    rule_smarts : str
        The reaction rule in SMARTS format.
    real_product : str
        The SMILES representation of the real product.
    real_substrate : str
        The SMILES representation of the real substrate.

    Attributes
    ----------
    query : Chem.Mol
        The query molecule for prediction.
    rule_smarts : str
        The reaction rule in SMARTS format.
    reaction : AllChem.Reaction
        The reaction object created from the reaction rule.
    real_product : Chem.Mol
        The real product molecule.
    real_substrate : Chem.Mol
        The real substrate molecule.
    unique_products : dict
        A dictionary to store information about unique predicted products.

    Methods
    -------
    predict()
        Predict reaction products and populate the 'unique_products' attribute.

    """
    def __init__(self, query: Chem.Mol, rule_smarts: str, real_product: str, real_substrate: str):
        """
        Initialize a RunPredictionRule instance.

        Parameters
        ----------
        query : Chem.Mol
            The query molecule for prediction.
        rule_smarts : str
            The reaction rule in SMARTS format.
        real_product : str
            The SMILES representation of the real product.
        real_substrate : str
            The SMILES representation of the real substrate.
        """
[docs]        self.query = query
[docs]        self.rule_smarts = rule_smarts
[docs]        self.reaction = AllChem.ReactionFromSmarts(self.rule_smarts, useSmiles=False)
        self.reaction.Initialize()
[docs]        self.real_product = Chem.MolFromSmiles(real_product)
[docs]        self.real_substrate = Chem.MolFromSmiles(real_substrate)

[docs]    def predict(self):
        """
        This method predicts reaction products based on the provided query molecule, reaction rule, real product, and real substrate. It calculates various properties and stores the results in the 'unique_products' dictionary.

        Returns
        -------
        None
        """
        self._main_reactant = self.reaction.GetReactantTemplate(0)
        num_substructure_matches = len(
            self.query.GetSubstructMatches(self._main_reactant)
        )
        self.unique_products = {}
        if num_substructure_matches > 0:
            self._atom_map = [
                a.GetAtomMapNum()
                for a in self._main_reactant.GetAtoms()
                if a.GetAtomMapNum() != 0
            ]

            results = self.reaction.RunReactants([self.query], maxProducts=10)

            for products in results:
                try:
                    main_product = products[0]
                    fixed_mol = dm.fix_mol(main_product)
                    fixed_mol = dm.sanitize_mol(fixed_mol)
                    fixed_mol = dm.standardize_mol(fixed_mol)
                    smi = Chem.MolToSmiles(
                        fixed_mol, isomericSmiles=False, canonical=True
                    )

                    (
                        main_product_atom_indexes,
                        query_atoms_indexes,
                    ) = self.__get_product_atom_indexes(main_product)

                    real_product_atom_indexes = [
                        a.GetIdx()
                        for a in self.real_product.GetAtoms()
                        if a.GetAtomMapNum() in self._atom_map
                    ]
                    real_substrate_atom_indexes = [
                        a.GetIdx()
                        for a in self.real_substrate.GetAtoms()
                        if a.GetAtomMapNum() in self._atom_map
                    ]

                    query_substructure = self.__get_mol_substructure(
                        self.query, query_atoms_indexes
                    )
                    real_substrate_substructure = self.__get_mol_substructure(
                        self.real_substrate, real_substrate_atom_indexes
                    )
                    main_product_substructure = self.__get_mol_substructure(
                        main_product, main_product_atom_indexes
                    )
                    real_product_substructure = self.__get_mol_substructure(
                        self.real_product, real_product_atom_indexes
                    )

                    self.unique_products[smi] = {
                        "main_product_smiles": smi,
                        "secondary_products_smiles": ".".join(
                            [Chem.MolToSmiles(p, canonical=True) for p in products[1:]]
                        ),
                        "similarity_substrates": self.__compute_similarity(
                            self.query, self.real_substrate, fingerPrintModel="rdkit"
                        ),
                        #'similarity_substrates_substructure':self.__compute_similarity(query_substructure,real_substrate_substructure,fingerPrintModel='rdkit'),
                        "similarity_products": self.__compute_similarity(
                            main_product, self.real_product, fingerPrintModel="rdkit"
                        ),
                        #'similarity_products_substructure':self.__compute_similarity(main_product_substructure,real_product_substructure,fingerPrintModel='rdkit'),
                        "reacting_atoms_in_query": query_atoms_indexes,
                    }
                except Exception as e:
                    pass

[docs]    def __get_product_atom_indexes(self, product: Chem.Mol):
        product_atoms_indexes = []
        query_atoms_indexes = []
        for atom in product.GetAtoms():
            atom_properties = atom.GetPropsAsDict()
            if set(["old_mapno", "react_atom_idx"]).issubset(
                set(atom_properties.keys())
            ):
                product_atoms_indexes.append(atom.GetIdx())
                query_atoms_indexes.append(int(atom.GetProp("react_atom_idx")))
        return product_atoms_indexes, query_atoms_indexes

[docs]    def __get_mol_substructure(self, mol_target: Chem.Mol, atom_indexes_to_keep):
        """
        Get the reaction substructure from the input molecule.
        :return: The reaction substructure
        """
        atom_indexes_to_keep = self.__get_substructure_neighbors(
            mol_target, atom_indexes_to_keep
        )
        atom_indexes_to_remove = [
            atom.GetIdx()
            for atom in mol_target.GetAtoms()
            if atom.GetIdx() not in atom_indexes_to_keep
        ]
        substructure = self.__trim_mol(mol_target, atom_indexes_to_remove)

        return Chem.MolFromSmiles(Chem.MolToSmiles(substructure))

[docs]    def __get_substructure_neighbors(self, mol_target: Chem.Mol, atom_indexes_to_keep):
        """
        Get the substructure neighbors of the input molecule.
        :return: The substructure neighbors
        """

        neighbor_atoms = []
        rings = mol_target.GetRingInfo()
        for atom_index in atom_indexes_to_keep:
            atom = mol_target.GetAtomWithIdx(atom_index)
            neighbor_atoms.extend(
                [neighbor.GetIdx() for neighbor in atom.GetNeighbors()]
            )
            neighbor_atoms.extend(
                [
                    neighbor_neighbor.GetIdx()
                    for neighbor in atom.GetNeighbors()
                    for neighbor_neighbor in neighbor.GetNeighbors()
                ]
            )
            neighbor_atoms.extend(
                [a for ring in rings.AtomRings() for a in ring if atom_index in ring]
            )

        unique_atom_pairs = list(itertools.combinations(set(neighbor_atoms), r=2))
        atom_path = []
        for i, j in unique_atom_pairs:
            atom_path += Chem.GetShortestPath(mol_target, i, j)

        return list(set(atom_path))

[docs]    def __trim_mol(self, mol_target: Chem.Mol, atom_indexes_to_remove):
        """
        Trim the input molecule to get the desired substructure.
        :param atoms_to_remove: The atoms to remove from the input molecule
        :return: The trimmed molecule
        """
        mol_target = copy.deepcopy(mol_target)
        Chem.SanitizeMol(mol_target)
        Chem.Kekulize(mol_target, clearAromaticFlags=True)

        editable_mol = Chem.EditableMol(mol_target)
        for atom_id in sorted(atom_indexes_to_remove, reverse=True):
            editable_mol.RemoveAtom(atom_id)

        return editable_mol.GetMol()

[docs]    def __compute_similarity(
        self, mol1: Chem.Mol, mol2: Chem.Mol, fingerPrintModel: str = "maccskey"
    ):
        if fingerPrintModel == "rdkit":
            similarity = DataStructs.FingerprintSimilarity(
                Chem.RDKFingerprint(mol1), Chem.RDKFingerprint(mol2)
            )
        if fingerPrintModel == "maccskey":
            similarity = DataStructs.FingerprintSimilarity(
                MACCSkeys.GenMACCSKeys(mol1), MACCSkeys.GenMACCSKeys(mol2)
            )

        return round(similarity, 3)