"""
This is a module that provides functions to load and process data for the MicrobeRX tool.
The module requires the following packages: pandas, importlib_resources
The module contains the following functions:
- load_reaction_rules: Load the reaction rules from a compressed tab-separated file.
- load_evidences: Load the evidences (human and gut-microbe biotransformations) from a compressed tab-separated file.
- load_microbes_reactions: Load the microbes reactions from a compressed tab-separated file.
- load_microbes_data: Load the microbes data from a compressed tab-separated file.
"""
# Public API of this module: the loader functions exported by a star-import.
__all__ = [
"load_reaction_rules",
"load_evidences",
"load_microbes_reactions",
"load_microbes_data",
]
from importlib_resources import files
import pandas as pd
import logging
# Configure the root logger at import time so the "Loading ..." messages emitted by
# the loaders are shown at INFO level in a "LEVEL: message" format.
# NOTE(review): calling basicConfig from a library module also configures logging for
# a host application that has not set up its own handlers yet -- confirm this is intended.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
# Locations of the gzip-compressed, tab-separated tables looked up inside the
# ``microberx.DataBase`` package via importlib_resources. Each loader below reads
# exactly one of these.
__REACTION_RULES = files("microberx.DataBase").joinpath("ReactionRules.tsv.gz")
__EVIDENCES = files("microberx.DataBase").joinpath("Evidences.tsv.gz")
__MICROBES_DATA = files("microberx.DataBase").joinpath("MicrobesData.tsv.gz")
__MICROBES_REACTIONS = files("microberx.DataBase").joinpath("MicrobesReactions.tsv.gz")
def load_reaction_rules() -> pd.DataFrame:
    """
    Load the reaction rules from a compressed tab-separated file.

    Logs an INFO message and reads the gzip-compressed ``ReactionRules.tsv.gz``
    table from the ``microberx.DataBase`` package.

    Returns:
        pandas.DataFrame: A dataframe containing the reaction rules, with columns:
            - num_atoms : Number of atoms to match in the query to perform a prediction.
            - rule : SMARTS string of the single reactant reaction rule (SRRR).
            - reaction_id : Reaction id, either a unified MetaNetX v4.0 id or an AGORA2 id.
            - substrate : MetaNetX id of the real substrate of the SRRR.
            - substrate_map : Atom-mapped SMARTS of the real substrate of the SRRR.
            - product : MetaNetX id of the main real product of the SRRR.
            - product_map : Atom-mapped SMARTS of the main real product of the SRRR.
    """
    logging.info("Loading reaction rules...")
    return pd.read_csv(__REACTION_RULES, sep="\t", compression="gzip")
def load_evidences() -> pd.DataFrame:
    """
    Load the evidences from a compressed tab-separated file.

    Logs an INFO message and reads the gzip-compressed ``Evidences.tsv.gz`` table
    from the ``microberx.DataBase`` package. The table is not restricted to human
    evidences: the ``origin`` column distinguishes human from gut-microbe entries.

    Returns:
        pandas.DataFrame: A dataframe containing the evidences, with columns:
            - source : The unique identifier of the source coming from the metabolic reconstruction.
            - name : Name of the biotransformation; can match the enzyme name.
            - ec : Enzyme Commission number for the biotransformation.
            - mnx_id : Unified id from MetaNetX v4.0.
            - organisms_count : Number of organisms where this source id has been found.
            - xrefs : Cross-references to other reaction databases.
            - origin : Tells if the reaction is coming from human or gut microbes.
            - complexes_count : Number of genes or complexes found in the metabolic network for this biotransformation.
    """
    logging.info("Loading evidences...")
    return pd.read_csv(__EVIDENCES, sep="\t", compression="gzip")
def load_microbes_reactions() -> pd.DataFrame:
    """
    Load the microbes reactions from a compressed tab-separated file.

    Logs an INFO message and reads the gzip-compressed ``MicrobesReactions.tsv.gz``
    table from the ``microberx.DataBase`` package. The first column of the file is
    used as the index and every value is read as a string.

    Returns:
        pandas.DataFrame: A dataframe containing the microbes reactions, laid out as:
            - index : Strain name of all gut microbes included in MicrobeRX (source: AGORA2).
            - columns : Source name of the biotransformation from the metabolic reconstructions.
            - data : Each cell holds the genes or complexes annotated for that organism and biotransformation.
    """
    logging.info("Loading microbes reactions...")
    return pd.read_csv(
        __MICROBES_REACTIONS, sep="\t", index_col=[0], compression="gzip", dtype=str
    )
def load_microbes_data() -> pd.DataFrame:
    """
    Load the microbes data from a compressed tab-separated file.

    Logs an INFO message and reads the gzip-compressed ``MicrobesData.tsv.gz``
    table from the ``microberx.DataBase`` package.

    Returns:
        pandas.DataFrame: A dataframe containing the microbes data, with columns:
            - microbe_name
            - Strain
            - Species
            - Genus
            - Family
            - Order
            - Class
            - Phylum
            - Kingdom
            - Host
            - NCBI Taxonomy ID
            - Cultured
            - Ecosystem
            - Ecosystem Category
            - Ecosystem Subtype
            - Ecosystem Type
            - Gram Staining
            - Oxygen Requirement
            - Motility
    """
    logging.info("Loading microbes data...")
    return pd.read_csv(__MICROBES_DATA, sep="\t", compression="gzip")