Source code for microberx.DataFiles

"""
This is a module that provides functions to load and process data for the MicrobeRX tool.

The module requires the following packages: pandas, importlib_resources

The module contains the following functions:

- load_reaction_rules: Load the reaction rules from a compressed tab-separated file.
- load_evidences: Load the human and gut-microbe evidences from a compressed tab-separated file.
- load_microbes_reactions: Load the microbes reactions from a compressed tab-separated file.
- load_microbes_data: Load the microbes data from a compressed tab-separated file.
"""

# Public API of this module: the names exported by ``from ... import *``.
__all__ = [
    "load_reaction_rules",
    "load_evidences",
    "load_microbes_reactions",
    "load_microbes_data",
]


from importlib_resources import files
import pandas as pd
import logging

# Set up root-logger output at INFO level with a "LEVEL: message" format so the
# loaders' progress messages are visible.
# NOTE(review): this runs at import time and touches the root logger — confirm
# that is intended for a library module.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")


# Handles to the gzip-compressed, tab-separated tables located inside the
# ``microberx.DataBase`` package; each loader below reads exactly one of them.
__REACTION_RULES = files("microberx.DataBase").joinpath("ReactionRules.tsv.gz")
__EVIDENCES = files("microberx.DataBase").joinpath("Evidences.tsv.gz")
__MICROBES_DATA = files("microberx.DataBase").joinpath("MicrobesData.tsv.gz")
__MICROBES_REACTIONS = files("microberx.DataBase").joinpath("MicrobesReactions.tsv.gz")


def load_reaction_rules():
    """
    Load the reaction rules from a compressed tab-separated file.

    Returns:
        pandas.DataFrame: A dataframe containing the reaction rules, with columns:
            - num_atoms : Number of atoms to match in the query to perform a prediction.
            - rule : SMARTS string of the single reactant reaction rule (SRRR).
            - reaction_id : Reaction id, as a unified MetaNetX v4.0 id or an AGORA2 id.
            - substrate : MetaNetX id of the real substrate of the SRRR.
            - substrate_map : Atom-mapped SMARTS of the real substrate of the SRRR.
            - product : MetaNetX id of the main real product of the SRRR
              (the previous text said "substrate" here; TODO confirm).
            - product_map : Atom-mapped SMARTS of the main real product of the SRRR.
    """
    logging.info("Loading reaction rules...")
    # Read through the resource's own binary stream instead of handing the
    # Traversable to pandas: a Traversable is only a usable path when the
    # package sits unpacked on the filesystem, whereas ``open`` works for any
    # resource backend.
    with __REACTION_RULES.open("rb") as handle:
        return pd.read_csv(handle, sep="\t", compression="gzip")
def load_evidences():
    """
    Load the evidences (human and gut microbes) from a compressed tab-separated file.

    Returns:
        pandas.DataFrame: A dataframe containing the evidences, with columns:
            - source : The unique identifier of the source coming from the metabolic reconstruction.
            - name : Name of the biotransformation; can match the enzyme name.
            - ec : Enzyme Commission number for the biotransformation.
            - mnx_id : Unified id from MetaNetX v4.0.
            - organisms_count : Number of organisms where this source id has been found.
            - xrefs : Cross-references to other reaction databases.
            - origin : Tells if the reaction is coming from human or gut microbes.
            - complexes_count : Number of genes or complexes found in the metabolic
              network for this biotransformation.
    """
    logging.info("Loading evidences...")
    # Read through the resource's own binary stream instead of handing the
    # Traversable to pandas: a Traversable is only a usable path when the
    # package sits unpacked on the filesystem, whereas ``open`` works for any
    # resource backend.
    with __EVIDENCES.open("rb") as handle:
        return pd.read_csv(handle, sep="\t", compression="gzip")
def load_microbes_reactions():
    """
    Load the microbes reactions from a compressed tab-separated file.

    The first column of the file is used as the index and every value is read
    as a string.

    Returns:
        pandas.DataFrame: A dataframe containing the microbes reactions.
            - index : Strain name of all gut microbes included in MicrobeRX (source: AGORA2).
            - columns : Source name of the biotransformation from the metabolic reconstructions.
            - data : Each cell contains information about the genes or complexes that
              have been annotated for each organism and biotransformation.
    """
    logging.info("Loading microbes reactions...")
    # Read through the resource's own binary stream instead of handing the
    # Traversable to pandas: a Traversable is only a usable path when the
    # package sits unpacked on the filesystem, whereas ``open`` works for any
    # resource backend.
    with __MICROBES_REACTIONS.open("rb") as handle:
        return pd.read_csv(
            handle, sep="\t", index_col=[0], compression="gzip", dtype=str
        )
def load_microbes_data():
    """
    Load the microbes data from a compressed tab-separated file.

    Returns:
        pandas.DataFrame: A dataframe containing the microbes data, with columns:
            - microbe_name
            - Strain
            - Species
            - Genus
            - Family
            - Order
            - Class
            - Phylum
            - Kingdom
            - Host
            - NCBI Taxonomy ID
            - Cultured
            - Ecosystem
            - Ecosystem Category
            - Ecosystem Subtype
            - Ecosystem Type
            - Gram Staining
            - Oxygen Requirement
            - Motility
    """
    logging.info("Loading microbes data...")
    # Read through the resource's own binary stream instead of handing the
    # Traversable to pandas: a Traversable is only a usable path when the
    # package sits unpacked on the filesystem, whereas ``open`` works for any
    # resource backend.
    with __MICROBES_DATA.open("rb") as handle:
        return pd.read_csv(handle, sep="\t", compression="gzip")