Source code for micone.validation.otu_schema

"""
    Module that defines the schema for a valid OTU table
"""

from biom import Table
import numpy as np
from schematics.exceptions import ValidationError
from schematics.types import BaseType


class HeaderType(BaseType):
    """
        Schema type describing the expected structure and format
        of the sample headers
    """

    def validate_header(self, value):
        """
            Verify that every entry of the header is a string

            Parameters
            ----------
            value : iterable
                The sample headers to check

            Raises
            ------
            ValidationError
                If at least one entry is not an instance of `str`
        """
        if not all(isinstance(sample, str) for sample in value):
            raise ValidationError("Invalid header. All samples must be strings")
class IndexType(BaseType):
    """
        DataType that describes the expected structure and format
        for the OTU indices
    """

    def validate_index_str(self, value):
        """
            Check whether every index is a string

            Parameters
            ----------
            value : iterable
                The OTU indices to check

            Raises
            ------
            ValidationError
                If at least one index is not an instance of `str`
        """
        if any(not isinstance(v, str) for v in value):
            raise ValidationError("Invalid index. All indices must be strings")

    def validate_index_unique(self, value):
        """
            Check whether the indices contain no duplicates

            Parameters
            ----------
            value : sized iterable of hashables
                The OTU indices to check

            Raises
            ------
            ValidationError
                If the number of distinct indices differs from the
                number of indices
        """
        if len(value) != len(set(value)):
            # Message previously misspelled "unique" as "unqiue"
            raise ValidationError("Invalid index. All indices must be unique")
class DataType(BaseType):
    """
        DataType that describes the expected structure and format
        for abundance values

        Parameters
        ----------
        norm : bool
            True if the abundances are expected to be normalized
    """

    def __init__(self, norm, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.norm = norm

    def validate_data_npfloat(self, value):
        """
            Check whether the abundances are stored as float64

            Parameters
            ----------
            value : object exposing a `dtype` attribute
                The table whose data type is checked

            Raises
            ------
            ValidationError
                If `value.dtype` does not compare equal to "float64"
        """
        if value.dtype != "float64":
            raise ValidationError("Invalid data. Abundances must be float64")

    def validate_data_range(self, value):
        """
            Check whether the abundances lie in the permitted range

            Negative values are always rejected. When `self.norm` is set,
            values above 1 are rejected and every column of the dense
            table must sum to 1 (within `numpy.allclose` default tolerances).

            Parameters
            ----------
            value : object exposing `to_dataframe(dense=True)`
                The table whose abundances are checked

            Raises
            ------
            ValidationError
                If any abundance is negative, or if `self.norm` is set and
                the abundances are not normalized
        """
        # Materialize the dense matrix once and reuse it for every check
        abundances = value.to_dataframe(dense=True).values
        if abundances.min() < 0:
            raise ValidationError("Invalid data. Abundances cannot be negative")
        if not self.norm:
            return
        # The lower bound was already enforced above; only the upper bound remains
        if abundances.max() > 1:
            raise ValidationError("Invalid data. Abundances are not normalized")
        if not np.allclose(abundances.sum(axis=0), 1.0):
            raise ValidationError("Invalid data. Abundances are not normalized")
class SamplemetaType(BaseType):
    """
        DataType that describes the expected structure and format
        for the sample metadata
    """

    def validate_samplemeta_columns(self, value):
        """
            Check whether the sample metadata has at least one column

            Parameters
            ----------
            value : object exposing `columns`
                The sample metadata table

            Raises
            ------
            ValidationError
                If the metadata has no columns
        """
        if len(value.columns) < 1:
            # Message previously misspelled "metadata" as "metdata"
            raise ValidationError("Invalid columns in sample metadata")

    def validate_samplemeta_index(self, value):
        """
            Check whether the index of the sample metadata has no duplicates

            Parameters
            ----------
            value : object exposing `index`
                The sample metadata table

            Raises
            ------
            ValidationError
                If the number of distinct index entries differs from the
                number of index entries
        """
        if len(value.index) != len(set(value.index)):
            # Message previously misspelled "unique" as "unqiue"
            raise ValidationError(
                "Invalid index in sample metadata. All indices must be unique"
            )

    def validate_structure(self, value):
        """
            Check whether the index of the sample metadata is well formed

            Parameters
            ----------
            value : object exposing `index` with a `.str` accessor
                The sample metadata table

            Raises
            ------
            ValidationError
                If any index entry is not a `str`, or if any index entry
                starts with "#" (which suggests a header line was read as data)
        """
        if any(not isinstance(v, str) for v in value.index):
            raise ValidationError("Invalid index. All indices must be strings")
        if value.index.str.startswith("#").any():
            raise ValidationError(
                "Invalid sample metadata structure. Possibly incorrect header"
            )
class ObsmetaType(BaseType):
    """
        DataType that describes the expected structure and format
        for the observation metadata
    """

    # Taxonomy levels, ordered from the highest to the lowest rank
    _req_keys = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
    # Optional non-taxonomy columns that are allowed next to the levels
    _extra_keys = ["Confidence", "Abundance"]

    def validate_index(self, value):
        """
            Check whether every index entry of the metadata is a string

            Raises
            ------
            ValidationError
                If at least one index entry is not an instance of `str`
        """
        if any(not isinstance(v, str) for v in value.index):
            raise ValidationError("Invalid index. All indices must be strings")

    def validate_obsmeta_headers(self, value):
        """
            Check whether the columns of the metadata are known and complete

            Every column must be one of `_req_keys` or `_extra_keys`, and the
            taxonomy levels that are present must form an unbroken prefix of
            `_req_keys` (if Genus is present, every level above it is too).

            Raises
            ------
            ValidationError
                If an unknown column is present, or if a taxonomy level is
                missing while a lower level is present
        """
        for col in value.columns:
            if col not in self._req_keys and col not in self._extra_keys:
                raise ValidationError(
                    f"Invalid observation metadata. Unknown attribute {col} present"
                )
        # Count only taxonomy columns: the optional extra columns must not
        # distort the comparison against the position of the missing level
        n_levels = sum(1 for col in value.columns if col in self._req_keys)
        for ind, key in enumerate(self._req_keys):
            if key not in value.columns:
                # `key` is the first missing level; the levels are in order
                # only if exactly the `ind` levels above it are present
                if n_levels != ind:
                    raise ValidationError(
                        f"Invalid observation metadata. Required attribute {key} not present"
                    )
                break

    def validate_obsmeta_data(self, value):
        """
            Check whether the contents of the metadata columns are valid

            The Confidence column (if present) must be of type float with all
            values between 0 and 1. The Abundance column (if present) must be
            of type float. The check of the taxonomy names against the allowed
            character set is computed but currently not enforced (see TODO).

            Raises
            ------
            ValidationError
                If the Confidence or Abundance columns are not of type float,
                or if any Confidence value lies outside of [0, 1]
        """
        confidence_key, abundance_key = self._extra_keys
        # `df` ends up holding only the columns that are not extra keys
        df = value
        if confidence_key in value.columns:
            confidence = value[confidence_key]
            if confidence.dtype != float:
                raise ValidationError(
                    "Invalid observation metadata. "
                    f"{confidence_key} column must be of type float"
                )
            # Both extremes must be in range; one valid extreme is not enough
            in_range = 0 <= confidence.min() <= 1 and 0 <= confidence.max() <= 1
            if not in_range:
                raise ValidationError(
                    "Invalid observation metadata. "
                    f"{confidence_key} must have a value between 0 and 1"
                )
            df = df.drop(confidence_key, axis=1)
        if abundance_key in value.columns:
            abundance = value[abundance_key]
            if abundance.dtype != float:
                raise ValidationError(
                    "Invalid observation metadata. "
                    f"{abundance_key} column must be of type float"
                )
            # Drop from `df` (not `value`) so that the Confidence column
            # removed above stays removed
            df = df.drop(abundance_key, axis=1)
        allowed_name = r"^[a-zA-Z0-9-._ ]+(?<! )$"
        for level, data in df.items():
            if level not in self._req_keys:
                continue
            filt_data = data[data != ""]
            # Result is intentionally unused while the check below is disabled
            query = filt_data[~filt_data.str.contains(allowed_name)].any()
            # TODO: FIXME:
            # if query:
            #     raise ValidationError(
            #         "Invalid observation metadata. "
            #         f"Taxonomy names are not standard: {query} is not allowed in {level}"
            #     )
class BiomType(BaseType):
    """
        Schema type describing the expected structure and format
        of a `biom.Table`

        Parameters
        ----------
        norm : bool, optional
            True if abundances are normalized
            Default value is False
    """

    def __init__(self, norm=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.norm = norm

    def validate_istable(self, value):
        """
            Ensure that the object is a `biom.Table` instance
        """
        if isinstance(value, Table):
            return
        raise ValidationError("Object must be a `biom.Table` instance")

    def validate_samples(self, value):
        """
            Run the `HeaderType` validation on the sample ids of the Table
        """
        HeaderType().validate(value.ids(axis="sample"))

    def validate_index(self, value):
        """
            Run the `IndexType` validation on the observation ids of the Table
        """
        IndexType().validate(value.ids(axis="observation"))

    def validate_data(self, value):
        """
            Run the `DataType` validation on the Table, forwarding `self.norm`
        """
        DataType(self.norm).validate(value)

    def validate_sample_metadata(self, value):
        """
            Run the `SamplemetaType` validation on the sample metadata
            of the Table
        """
        SamplemetaType().validate(value.metadata_to_dataframe("sample"))

    def validate_obs_metadata(self, value):
        """
            Run the `ObsmetaType` validation on the observation metadata
            of the Table
        """
        ObsmetaType().validate(value.metadata_to_dataframe("observation"))