Source code for micone.validation.otu_schema

"""
    Module that defines the schema for a valid OTU table
"""

from biom import Table
import numpy as np
from schematics.exceptions import ValidationError
from schematics.types import BaseType


class HeaderType(BaseType):
    """
    DataType that describes the expected structure and format for the sample headers

    The value being validated is iterated once; it is expected to be the
    collection of sample identifiers.
    """

    def validate_header(self, value):
        """
        Check whether the header is valid

        Raises
        ------
        ValidationError
            If at least one element of `value` is not a `str` instance
        """
        all_strings = all(isinstance(sample, str) for sample in value)
        if not all_strings:
            raise ValidationError("Invalid header. All samples must be strings")


class IndexType(BaseType):
    """
    DataType that describes the expected structure and format for the OTU indices

    The value being validated must be sized and iterable, and its elements
    must be hashable (they are collected into a `set` to detect duplicates).
    """

    def validate_index_str(self, value):
        """
        Check that every index is a string

        Raises
        ------
        ValidationError
            If at least one element of `value` is not a `str` instance
        """
        all_strings = all(isinstance(otu, str) for otu in value)
        if not all_strings:
            raise ValidationError("Invalid index. All indices must be strings")

    def validate_index_unique(self, value):
        """
        Check that no index occurs more than once

        Raises
        ------
        ValidationError
            If `value` contains duplicated elements
        """
        n_distinct = len(set(value))
        if n_distinct != len(value):
            raise ValidationError("Invalid index. All indices must be unqiue")


class DataType(BaseType):
    """
    DataType that describes the expected structure and format for abundance values

    Parameters
    ----------
    norm : bool
        When truthy, the abundances are additionally required to be
        normalized: no value above 1 and every column of the dense table
        summing to (approximately) 1
    """

    def __init__(self, norm, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.norm = norm

    def validate_data_npfloat(self, value):
        """
        Check that the abundances are stored as float64

        Raises
        ------
        ValidationError
            If `value.dtype` does not compare equal to ``"float64"``
        """
        if value.dtype != "float64":
            raise ValidationError("Invalid data. Abundances must be float64")

    def validate_data_range(self, value):
        """
        Check that the abundances lie in the permitted range

        The table is converted with ``value.to_dataframe(dense=True)`` and the
        checks are run on the resulting array of values.

        Raises
        ------
        ValidationError
            If any abundance is negative, or, when `self.norm` is truthy, if
            any abundance exceeds 1 or any column sum is not close to 1
        """
        abundances = value.to_dataframe(dense=True).values
        if abundances.min() < 0:
            raise ValidationError("Invalid data. Abundances cannot be negative")
        if not self.norm:
            return
        # Negative values were already rejected above, so only the upper
        # bound is left to check here.
        if abundances.max() > 1:
            raise ValidationError("Invalid data. Abundances are not normalized")
        column_sums = abundances.sum(axis=0)
        if not np.isclose(column_sums, 1.0).all():
            raise ValidationError("Invalid data. Abundances are not normalized")


class SamplemetaType(BaseType):
    """
    DataType that describes the expected structure and format for the sample metadata

    The validators access `value.columns` and `value.index`, and use the
    ``.str`` accessor of the index.
    """

    def validate_samplemeta_columns(self, value):
        """
        Check that the sample metadata has at least one column

        Raises
        ------
        ValidationError
            If `value.columns` is empty
        """
        n_columns = len(value.columns)
        if n_columns == 0:
            raise ValidationError("Invalid columns in sample metdata")

    def validate_samplemeta_index(self, value):
        """
        Check that the sample metadata index has no duplicated labels

        Raises
        ------
        ValidationError
            If `value.index` contains duplicates
        """
        labels = value.index
        if len(set(labels)) != len(labels):
            raise ValidationError(
                "Invalid index in sample metadata. All indices must be unqiue"
            )

    def validate_structure(self, value):
        """
        Check that the index labels are strings and none starts with ``#``

        Raises
        ------
        ValidationError
            If an index label is not a `str`, or if any label starts with
            ``#`` (reported as a possibly incorrect header)
        """
        labels = value.index
        if not all(isinstance(label, str) for label in labels):
            raise ValidationError("Invalid index. All indices must be strings")
        starts_with_hash = labels.str.startswith("#")
        if starts_with_hash.any():
            raise ValidationError(
                "Invalid sample metadata structure. Possibly incorrect header"
            )


class ObsmetaType(BaseType):
    """
    DataType that describes the expected structure and format for the observation metadata

    The validators access `value.index`, `value.columns`, `value[...]`,
    `value.drop(...)` and `.items()`, i.e. a DataFrame-like object.
    """

    # Taxonomy ranks, ordered from the highest rank to the lowest
    _req_keys = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
    # Optional non-taxonomy columns: [confidence column, abundance column]
    _extra_keys = ["Confidence", "Abundance"]

    def validate_index(self, value):
        """
        Check that every index label is a string

        Raises
        ------
        ValidationError
            If at least one element of `value.index` is not a `str` instance
        """
        if any(not isinstance(v, str) for v in value.index):
            raise ValidationError("Invalid index. All indices must be strings")

    def validate_obsmeta_headers(self, value):
        """
        Check that the columns are known and the taxonomy ranks have no gaps

        Every column must be one of `_req_keys` or `_extra_keys`. The ranks
        that are present must form a prefix of `_req_keys`: if a rank is
        present, every rank above it must be present too. The optional
        `_extra_keys` columns are not counted when checking for gaps.

        Raises
        ------
        ValidationError
            If an unknown column is present, or if a rank is missing while a
            lower rank is present
        """
        for col in value.columns:
            if col not in self._req_keys and col not in self._extra_keys:
                raise ValidationError(
                    f"Invalid observation metadata. Unknown attribute {col} present"
                )
        # Check if keys are in order
        # i.e. if genus is present everything above that level is present
        # Only taxonomy ranks are counted: the optional extra columns must not
        # make a complete prefix of ranks look like it has a gap.
        n_ranks_present = sum(key in value.columns for key in self._req_keys)
        for position, key in enumerate(self._req_keys):
            if key in value.columns:
                continue
            # `key` is the first missing rank. The ranks form a prefix only if
            # exactly the `position` ranks before it are present.
            if n_ranks_present != position:
                raise ValidationError(
                    f"Invalid observation metadata. Required attribute {key} not present"
                )
            break

    def validate_obsmeta_data(self, value):
        """
        Check the contents of the observation metadata columns

        When present, the confidence column must have dtype float with its
        minimum and maximum both inside [0, 1], and the abundance column must
        have dtype float. The check on the taxonomy names is currently
        disabled (see the TODO below), so it never raises a `ValidationError`.

        Raises
        ------
        ValidationError
            If the confidence column is not float or has values outside
            [0, 1], or if the abundance column is not float
        """
        confidence_key, abundance_key = self._extra_keys
        if confidence_key in value.columns:
            confidence = value[confidence_key]
            if confidence.dtype != float:
                raise ValidationError(
                    "Invalid observation metadata. "
                    f"{confidence_key} column must be of type float"
                )
            min_in_range = 0 <= confidence.min() <= 1
            max_in_range = 0 <= confidence.max() <= 1
            # Reject as soon as EITHER extreme leaves [0, 1]; requiring both
            # to be out of range would let e.g. values 0.5 and 1.5 through.
            if not (min_in_range and max_in_range):
                raise ValidationError(
                    "Invalid observation metadata. "
                    f"{confidence_key} must have a value between 0 and 1"
                )
        if abundance_key in value.columns:
            abundance = value[abundance_key]
            if abundance.dtype != float:
                raise ValidationError(
                    "Invalid observation metadata. "
                    f"{abundance_key} column must be of type float"
                )
        # Remove every extra column that is present in a single step, so that
        # only taxonomy columns reach the name check below.
        extra_present = [key for key in self._extra_keys if key in value.columns]
        df = value.drop(extra_present, axis=1)
        for level, data in df.items():
            if level not in self._req_keys:
                continue
            # Empty strings are ignored by the name check
            filt_data = data[data != ""]
            # Names that do not consist solely of letters, digits, '-', '.',
            # '_' and spaces, or that end with a space. The result is not
            # acted upon until the check below is re-enabled.
            query = filt_data[
                ~filt_data.str.contains(r"^[a-zA-Z0-9-._ ]+(?<! )$")
            ].any()
            # TODO: FIXME:
            # if query:
            #     raise ValidationError(
            #         "Invalid observation metadata. "
            #         f"Taxonomy names are not standard: {query} is not allowed in {level}"
            #     )


class BiomType(BaseType):
    """
    DataType that describes the expected structure and format for the `biom.Table`

    Each validator delegates to one of the specialised types defined in this
    module (`HeaderType`, `IndexType`, `DataType`, `SamplemetaType`,
    `ObsmetaType`).

    Parameters
    ----------
    norm : bool, optional
        True if abundances are normalized
        Default value is False
    """

    def __init__(self, norm=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.norm = norm

    def validate_istable(self, value):
        """ Check whether the object is a `biom.Table` """
        if isinstance(value, Table):
            return
        raise ValidationError("Object must be a `biom.Table` instance")

    def validate_samples(self, value):
        """ Check whether the samples (columns) of the Table are valid """
        HeaderType().validate(value.ids(axis="sample"))

    def validate_index(self, value):
        """ Check whether the indices in the Table are valid """
        IndexType().validate(value.ids(axis="observation"))

    def validate_data(self, value):
        """ Check whether the data in the Table is valid """
        # The normalization flag of this type is forwarded to the data check
        DataType(self.norm).validate(value)

    def validate_sample_metadata(self, value):
        """ Check whether the sample metadata in the Table is valid """
        SamplemetaType().validate(value.metadata_to_dataframe("sample"))

    def validate_obs_metadata(self, value):
        """ Check whether the observation metadata in the Table is valid """
        ObsmetaType().validate(value.metadata_to_dataframe("observation"))