"""
This module contains functions to compute and analyze molecular properties and descriptors of metabolites using various libraries and web services.
The module has the following functions:
- compute_molecular_descriptors: Computes some molecular descriptors for a given data frame of SMILES strings.
- compute_isotopic_mass: Computes the isotopic mass distribution of a given data frame using the pyOpenMS library.
- search_pubchem: Searches the PubChem database for compounds that match a given data frame of identifiers.
- classify_molecules: Classify molecules based on their SMILES strings using the ClassyFire web service.
"""
__all__ = [
"compute_molecular_descriptors",
"compute_isotopic_mass",
"search_pubchem",
"classify_molecules"
]
import requests, copy, random, time
import pandas as pd
import numpy as np
import pubchempy as pcp
from pyopenms import *
import plotly.express as px
import plotly.graph_objects as go
from distinctipy import distinctipy
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem.FilterCatalog import *
from tqdm.notebook import tqdm
[docs]def compute_molecular_descriptors(data_frame: pd.DataFrame, smiles_col: str):
"""
Computes some molecular descriptors and filters for a given data frame of SMILES strings.
Parameters
----------
data_frame : pd.DataFrame
A pandas data frame that contains SMILES strings of molecules.
smiles_col : str
The name of the column that contains the SMILES strings.
Returns
-------
data_frame : pd.DataFrame
The same data frame as input, but with additional columns for each molecular descriptor and filter computed. The descriptors and filters are:
- MolWt: the molecular weight of the molecule
- LogP: the octanol-water partition coefficient of the molecule
- NumHAcceptors: the number of hydrogen bond acceptors in the molecule
- NumHDonors: the number of hydrogen bond donors in the molecule
- NumRotatableBonds: the number of rotatable bonds in the molecule
- TPSA: the topological polar surface area of the molecule
- MolFormula: the molecular formula of the molecule
- Lipinski: a boolean value that indicates whether the molecule satisfies the Lipinski's rule of five or not. The rule of five states that most drug-like molecules have molecular weight less than 500, LogP less than 5, number of hydrogen bond acceptors less than 10, and number of hydrogen bond donors less than 5.
- Veber: a boolean value that indicates whether the molecule satisfies the Veber's rule or not. The rule states that most orally active drugs have 10 or fewer rotatable bonds and a polar surface area equal to or less than 140 Å2.
- Brenk: a string that contains the names of the Brenk filters that the molecule matches, separated by semicolons. The Brenk filters are a set of 68 unwanted substructures that are associated with reactive fucntional groups.
- PAINS: a string that contains the names of the PAINS filters that the molecule matches, separated by semicolons. The PAINS filters are a set of 480 substructures that are associated with pan-assay interference compounds.
"""
brenk_params = FilterCatalogParams()
brenk_params.AddCatalog(FilterCatalogParams.FilterCatalogs.BRENK)
brenk = FilterCatalog(brenk_params)
pains_params = FilterCatalogParams()
pains_params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
pains = FilterCatalog(pains_params)
data_frame = copy.deepcopy(data_frame)
for index in data_frame.index:
mol = Chem.MolFromSmiles(data_frame[smiles_col][index])
if mol:
data_frame.loc[index, "MolWt"] = round(Descriptors.MolWt(mol), 3)
data_frame.loc[index, "LogP"] = round(Descriptors.MolLogP(mol), 3)
data_frame.loc[index, "NumHAcceptors"] = Descriptors.NumHAcceptors(mol)
data_frame.loc[index, "NumHDonors"] = Descriptors.NumHDonors(mol)
data_frame.loc[index, "NumRotatableBonds"] = Descriptors.NumRotatableBonds(
mol
)
data_frame.loc[index, "TPSA"] = Descriptors.TPSA(mol)
data_frame.loc[index, "MolFormula"] = AllChem.CalcMolFormula(mol)
if data_frame["NumHDonors"][index] > 5 or data_frame["NumHAcceptors"][index] > 10 or data_frame["MolWt"][index] > 500 or data_frame["LogP"][index] > 5 :
data_frame.loc[index,"Lipinski"]=False
else:
data_frame.loc[index,"Lipinski"]=True
if data_frame["NumHDonors"][index] > 5 or data_frame["NumHAcceptors"][index] > 10 or data_frame["MolWt"][index] > 500 or data_frame["LogP"][index] > 5 or data_frame["NumRotatableBonds"][index] > 10 or data_frame["TPSA"][index] > 140:
data_frame.loc[index,"Veber"]=False
else:
data_frame.loc[index,"Veber"]=True
brenk_results=brenk.GetMatches(mol)
pains_results=pains.GetMatches(mol)
if brenk_results:
data_frame.loc[index,"Brenk"]=";".join([r.GetDescription() for r in brenk_results])
if pains_results:
data_frame.loc[index,"PAINS"]=";".join([r.GetDescription() for r in pains_results])
return data_frame
[docs]def compute_isotopic_mass(data_frame: pd.DataFrame, molformula_col: str):
"""
Computes the isotopic mass distribution of a given data frame using the pyOpenMS library.
Parameters
----------
data_frame : pd.DataFrame
A pandas data frame that contains the molecular formulas as a column.
molformula_col : str
A string that specifies the name of the column that contains the molecular formulas.
Returns
-------
data_frame : pd.DataFrame
A pandas data frame that has two additional columns: 'probability_sum' and 'mass_distribution'. The 'probability_sum' column contains the sum of the probabilities of all isotopes for each molecular formula. The 'mass_distribution' column contains the mass and probability of each isotope as a string, separated by semicolons.
Example
-------
>>> import pandas as pd
>>> from pyopenms import EmpiricalFormula, CoarseIsotopePatternGenerator
>>> df = pd.DataFrame({'formula': ['C6H12O6', 'C2H4O2', 'C3H8O3']})
>>> df = compute_isotopic_mass(df, 'formula')
>>> print(df)
formula probability_sum mass_distribution
0 C6H12O6 1.0000 180.0634:100.0;181.0668:10.72;182.0701:1.176;183...
1 C2H4O2 1.0000 60.0211:100.0;61.0245:11.08;62.0279:1.216;63.031...
2 C3H8O3 0.9999 92.0473:100.0;93.0507:10.55;94.0541:1.159;95.057...
"""
data_frame = copy.deepcopy(data_frame)
for index in data_frame.index:
try:
molF = EmpiricalFormula(data_frame[molformula_col][index])
isotopes = molF.getIsotopeDistribution(CoarseIsotopePatternGenerator(4))
prob_sum = sum([iso.getIntensity() for iso in isotopes.getContainer()])
distribution = ";".join(
[
f"{round(iso.getMZ(),4)}:{round(iso.getIntensity()*100,4)}"
for iso in isotopes.getContainer()
]
)
sumR = round(prob_sum, 4)
data_frame.loc[index, "probability_sum"] = sumR
data_frame.loc[index, "mass_distribution"] = distribution
except Exception:
pass
return data_frame
[docs]def search_pubchem(data_frame: pd.DataFrame, entry_col: str, entry_type: str = "smiles"):
"""
Searches the PubChem database for compounds that match a given data frame of identifiers.
Parameters
----------
data_frame : pd.DataFrame
A pandas data frame that contains the identifiers of the compounds to search for.
entry_col : str
A string that specifies the name of the column that contains the identifiers.
entry_type : str, optional
A string that specifies the type of the identifiers, such as 'smiles', 'inchi', 'cid', etc. The default is 'smiles'.
Returns
-------
data_frame : pd.DataFrame
The same data frame as input, but with additional columns for each PubChem property retrieved. The properties are:
- PubChem_CID: the PubChem compound identifier, separated by semicolons if there are multiple matches.
- PubChem_SID: the PubChem substance identifier, separated by semicolons if there are multiple matches. Only the first three SIDs are shown.
- PubChem_Synonyms: the synonyms of the compound, separated by semicolons if there are multiple matches.
"""
for index in tqdm(data_frame.index):
try:
matches = pcp.get_compounds(
data_frame[entry_col][index], namespace=entry_type
)
cids = [mat.cid for mat in matches]
sids = [m for mat in matches if mat.sids for m in mat.sids]
synonyms = [m for mat in matches if mat.synonyms for m in mat.synonyms]
data_frame.loc[index, "PubChem_CID"] = ";".join(map(str, cids))
data_frame.loc[index, "PubChem_SID"] = ";".join(map(str, sids[:3]))
data_frame.loc[index, "PubChem_Synonyms"] = ";".join(map(str, synonyms))
except Exception:
pass
return data_frame
[docs]def classify_molecules(data_frame: pd.DataFrame, smiles_col: str, names_col: str):
"""
Classify molecules based on their SMILES strings.
This function submits a query to the ClassyFire web service and returns a data frame with the classification results.
Parameters
----------
data_frame : pd.DataFrame
The input data frame with the molecules information.
smiles_col : str
The name of the column that contains the SMILES strings.
names_col : str
The name of the column that contains the molecule names.
Returns
-------
pd.DataFrame
The output data frame with the classification results added as new columns. The columns are:
- kingdom: the name of the chemical kingdom of the molecule, such as 'Organic compounds', 'Inorganic compounds', etc.
- superclass: the name of the chemical superclass of the molecule, such as 'Lipids and lipid-like molecules', 'Organoheterocyclic compounds', etc.
- class: the name of the chemical class of the molecule, such as 'Steroids and steroid derivatives', 'Benzodiazepines', etc.
- subclass: the name of the chemical subclass of the molecule, such as 'Cholestane steroids', '1,4-benzodiazepines', etc.
Raises
------
requests.exceptions.HTTPError
If the query to the ClassyFire web service fails.
"""
URL = "http://classyfire.wishartlab.com"
def _submit_query(data_frame: pd.DataFrame,smiles_col: str,names_col: str,label: str = "MicrobeRX",query_type:str="STRUCTURE"):
unique_mols = data_frame.drop_duplicates(subset=[names_col, smiles_col])
entries = [
f"{unique_mols[names_col][index]}\t{unique_mols[smiles_col][index]}"
for index in unique_mols.index
]
query_pattern = "\n".join(entries)
try:
q = requests.post(
f"{URL}/queries",
json={
"label": label,
"query_input": query_pattern,
"query_type": query_type,
"fstruc_content_type": "tsv",
},
headers={
"Accept": "application/json",
"Content-Type": "application/json",
},
)
return q.json()["id"]
except requests.exceptions.HTTPError as e:
return e.response
def _get_query(query_id: str, data_format: str = "json"):
try:
r = requests.get(
f"{URL}/queries/{query_id}.json", headers={"Accept": "application/json"}
)
except requests.exceptions.HTTPError as e:
return e.response
return r.json()
data_frame = copy.deepcopy(data_frame)
if len(data_frame.index)>100:
chunks=np.array_split(data_frame, round(len(data_frame.index)/100))
else:
chunks=[data_frame]
job_ids=[]
for c in chunks:
job=_submit_query(c,smiles_col=smiles_col,names_col=names_col)
job_ids.append(job)
time.sleep(10)
results=[]
for result in job_ids:
data=_get_query(str(result))
results.append(pd.DataFrame(data['entities']))
class_data=pd.concat(results,ignore_index=True)
class_data=class_data.transform(lambda x: x.apply(lambda y: y['name'] if isinstance(y,dict) else y))
class_data['alternative_parents']=class_data['alternative_parents'].apply(lambda x: ';'.join([y['name'] for y in x] if isinstance(x,list) else np.nan))
class_data['intermediate_nodes']=class_data['intermediate_nodes'].apply(lambda x: ';'.join([y['name'] for y in x] if isinstance(x,list) else np.nan))
class_data['external_descriptors']=class_data['external_descriptors'].apply(lambda x: ';'.join([f"{y['source']}:{y['source_id']}" for y in x] if isinstance(x,list) else np.nan))
class_data=class_data.transform(lambda x: x.apply(lambda y: ';'.join(y) if isinstance(y,list) else y))
data_frame_classification=data_frame.merge(class_data,left_on=names_col,right_on='identifier', how='outer')
return data_frame_classification