# Source code for micone.validation.otu_validator

"""
    Module that deals with the validation of an OTU table
"""

import pathlib
from typing import Dict, List, Optional, Union

import pandas as pd
from biom import Table, load_table

from .otu_schema import BiomType, ObsmetaType, SamplemetaType


class OtuValidator:
    """
    Validates input `OTU` table file and returns the `Otu` instance of the file

    Parameters
    ----------
    dtype : {'biom', 'tsv'}
        The type of OtuValidator instance to be created
    ext : str, optional
        The extension of the file if other than supported extensions
        A missing leading dot is added automatically ('counts' -> '.counts')
        Supported extensions:
        'tsv' dtype: '.tsv', '.txt', '.counts'
        'biom' dtype: '.biom', '.hdf5'

    Attributes
    ----------
    validator : BiomType
        The schematics validator instance

    Raises
    ------
    TypeError
        If `dtype` is not one of the supported types
    ValidationError
        If any of the files do not conform to the schema outlines in `otu_schema`

    Notes
    -----
    We assume that the extension dictates the filetype
    """

    # Class-level defaults. Instances work on their own copy of `_otu_exts`
    # (see `__init__`) so that a custom extension never leaks between instances.
    _otu_exts = {"tsv": [".tsv", ".txt", ".counts"], "biom": [".biom", ".hdf5"]}
    _meta_exts = [".csv", ".tsv"]
    _tax_exts = [".csv", ".tsv"]

    def __init__(self, dtype: str, ext: Optional[str] = None) -> None:
        self._dtype = dtype
        if dtype not in self._otu_exts:
            raise TypeError(
                f"{dtype} is not supported. Try one of {self._otu_exts.keys()}"
            )
        # Copy the class-level table before touching it: appending to the shared
        # lists would register `ext` for every other validator as well.
        self._otu_exts = {kind: list(exts) for kind, exts in self._otu_exts.items()}
        if ext:
            # `pathlib.Path.suffix` always carries the leading dot, so an
            # extension registered without one could never match.
            suffix = ext if ext.startswith(".") else f".{ext}"
            if suffix not in self._otu_exts[dtype]:
                self._otu_exts[dtype].append(suffix)
        self.validator = BiomType()

    @property
    def configuration(self) -> Dict[str, Union[str, List[str]]]:
        """
        Dictionary showing the current configuration of the instance

        Returns
        -------
        Dict[str, Union[str, List[str]]]
            The `dtype` and the valid OTU, metadata and taxonomy extensions
        """
        return {
            "dtype": self._dtype,
            "valid_otu_ext": self._otu_exts[self._dtype],
            "valid_meta_ext": self._meta_exts,
            "valid_tax_ext": self._tax_exts,
        }

    def _validate_ext(self, fpath: pathlib.Path) -> bool:
        """
        Determines whether the filetype is supported

        Parameters
        ----------
        fpath : pathlib.Path
            The path whose suffix is checked against the extensions of `dtype`

        Returns
        -------
        bool
            True if the suffix of `fpath` is a valid extension for `dtype`
        """
        return fpath.suffix in self._otu_exts[self._dtype]

    def _load_from_biom(self, otu_file: pathlib.Path) -> "Table":
        """
        Read biom table from file

        Parameters
        ----------
        otu_file : pathlib.Path
            The path to the OTU file in `biom` format

        Returns
        -------
        Table
            A `biom.Table` instance containing the OTU, meta, tax data
        """
        otudata = load_table(otu_file)
        self.validator.validate(otudata)
        return otudata

    @staticmethod
    def _extract_data(data_file: pathlib.Path, valid_exts: List[str]) -> pd.DataFrame:
        """
        Extract data as a `pd.DataFrame` from file

        Parameters
        ----------
        data_file : pathlib.Path
            The path to the data file
        valid_exts : List[str]
            A list of valid extensions (including the leading dot)

        Returns
        -------
        pd.DataFrame
            `pd.DataFrame` created from the data_file, indexed by its first column

        Raises
        ------
        TypeError
            If the suffix of `data_file` is not in `valid_exts`
        """
        ext = data_file.suffix
        if ext not in valid_exts:
            raise TypeError(
                "The input metadata file type is not supported. "
                f"Valid extensions are {valid_exts}"
            )
        # `suffix` includes the leading dot, so compare against ".tsv"/".csv".
        # Using the known delimiter avoids mis-sniffing files whose fields
        # contain the other delimiter (e.g. commas inside a tsv column).
        if ext == ".tsv":
            return pd.read_table(data_file, sep="\t", index_col=0, na_filter=False)
        if ext == ".csv":
            return pd.read_csv(data_file, sep=",", index_col=0, na_filter=False)
        # Any other accepted extension: let pandas sniff the delimiter
        return pd.read_csv(
            data_file, sep=None, engine="python", index_col=0, na_filter=False
        )

    def _load_from_tsv(
        self, otu_file: pathlib.Path, meta_file: pathlib.Path, tax_file: pathlib.Path
    ) -> "Table":
        """
        Read OTU counts file to biom table and add metadata and taxonomy data

        Parameters
        ----------
        otu_file : pathlib.Path
            The path to the tsv file containing the OTU counts table
        meta_file : pathlib.Path
            The path to the csv file containing the metadata information
        tax_file : pathlib.Path
            The path to the csv file containing the taxonomy information

        Returns
        -------
        Table
            A `biom.Table` instance containing the OTU, meta, tax data
        """
        otudata = load_table(otu_file)

        # Sample metadata: index is cast to `str` before validation
        metadata = self._extract_data(meta_file, self._meta_exts)
        metadata.index = metadata.index.astype(str)
        SamplemetaType().validate(metadata)

        # Observation (taxonomy) metadata: same treatment
        taxdata = self._extract_data(tax_file, self._tax_exts)
        taxdata.index = taxdata.index.astype(str)
        ObsmetaType().validate(taxdata)

        otudata.add_metadata(metadata.to_dict(orient="index"), axis="sample")
        otudata.add_metadata(taxdata.to_dict(orient="index"), axis="observation")
        self.validator.validate(otudata)
        return otudata

    def load_validate(
        self,
        otu_file: pathlib.Path,
        meta_file: Optional[pathlib.Path] = None,
        tax_file: Optional[pathlib.Path] = None,
    ) -> "Table":
        """
        Load the data and validate

        Parameters
        ----------
        otu_file : pathlib.Path
            The path to the `OTU` counts table
        meta_file : pathlib.Path, optional
            The path to the sample metadata file
            This argument is required if `dtype` is 'tsv'
        tax_file : pathlib.Path, optional
            The path to the taxonomy file
            This argument is required if `dtype` is 'tsv'

        Returns
        -------
        Table
            `biom.Table` containing all the data

        Raises
        ------
        ValueError
            If `dtype` is 'biom' and the extension of `otu_file` is not supported,
            or if `dtype` is 'tsv' and `meta_file` or `tax_file` is missing
        TypeError
            If `dtype` is 'tsv' and the extension of `otu_file` is not supported
        """
        err_msg = (
            "The input OTU file type is not supported. "
            f"Valid extensions are {self._otu_exts[self._dtype]}"
        )
        # NOTE: the two dtypes report an unsupported extension with different
        # exception types (ValueError vs TypeError); this is kept as-is so that
        # existing callers catching either one keep working.
        if self._dtype == "biom":
            if not self._validate_ext(otu_file):
                raise ValueError(err_msg)
            return self._load_from_biom(otu_file)

        # `__init__` only accepts 'biom' or 'tsv', so this is the 'tsv' path
        if not (meta_file and tax_file):
            raise ValueError("Missing metadata or taxonomy data")
        if not self._validate_ext(otu_file):
            raise TypeError(err_msg)
        return self._load_from_tsv(otu_file, meta_file, tax_file)