Source code for bnol.workflows

import sys
import numpy as np, pandas as pd, grequests, shelve
import logging
from collections import deque
from . import information

[docs]class PandasInformativeGenes(object):
    """Binary comparison of sub-classes of specimens. In the case of multiple specimen classes, expected use is in a one-vs-rest fashion.

    Provides standardized access to means of ranking genes e.g. through entropy improvement via :py:class:`~bnol.information.Discretize`.

    Args:
        specimens (pandas.DataFrame): shape (n,p) where the n index values constitute the specimens and the p columns the genes.
        classes (numpy.ndarray): shape (n,) class designation for each of the n specimens.

    Raises:
        Exception: if there is not exactly one class designation for each specimen.
    """
    def __init__(self, specimens, classes):
        self.specimens = np.asarray(specimens)
        self.classes = classes

        if not self.specimens.shape[0]==len(self.classes):
            raise Exception("A single class designation must be provided for each specimen.")

        self.genes = specimens.columns.values
        self.specimenLabels = specimens.index
        self.Discrete = information.Discretize()
        logging.debug("PandasInformativeGenes initialized successfully")

    def _classOverExpressionRatio(self, discretizedFeatures, overExpressionClass):
        """Convenience wrapper for the mean number of times that each gene is over-expressed, i.e. marked True, in the subset of genes passed defined by booleanClasses==True."""
        inClass = [discretizedFeatures[idx] for (idx, c) in enumerate(self.classes) if c==overExpressionClass]
        return np.mean(inClass, axis=0)

[docs]    def informativeGenes(self, allGenes=False):
        """Determine genes considered to be informative by means of decreasing class entropy through. Convenience wrapper for :py:class:`~bnol.information.Discretize`.

        Will additionally provide an optimal threshold for determining over- vs under-expression between the two classes.
        The proportion of specimens in each class, considered over-expressed by this threshold, will also be provided.

        Args:
            allGenes (bool): as it says on the box if True else only return those genes for which the entropy gain is greater than the MDLP criterion.

        Returns:
            pandas.DataFrame: details regarding genes, ranked in descending order of amount for which entropy gain exceeds MDLP criterion.
        """
        D = self.Discrete # convenience
        logging.debug("Starting feature discretization in PandasInformativeGenes")
        D.fit(self.specimens, self.classes)
        logging.debug("Finished feature discretization in PandasInformativeGenes")

        # determine a set of gene indices, ranked by decreasing entropy gain above MDLP criterion for said genes
        # the approach is not optimal in that we sort all p genes and then take a subset rather than taking the subset first
        # however, this is only run once, is very quick when tested with 22k+ genes and makes the code easier to read
        # simply sort and then take first p` where p` = p if allGenes else the total number of genes that are informative
        numGenesToInclude = len(self.genes) if allGenes else np.sum(self.Discrete.includeFeatures)
        gainAboveMDLP = self.Discrete.gains - self.Discrete.mdlpCriteria
        rankedGeneIndices = list(reversed(np.argsort(gainAboveMDLP)))[:numGenesToInclude]

        # for each of gene classes, calculate the proportion of specimens that show over-expression relative to the threshold
        (__, indices) = np.unique(self.classes, return_index=True)
        indices.sort()
        uniqueClasses = [self.classes[idx] for idx in indices] # note that this maintains the order as seen in the original set of classes

        orderedFeatures = D.discretizedFeatures[:,rankedGeneIndices]
        overExpression = [self._classOverExpressionRatio(orderedFeatures, c) for c in uniqueClasses]
        assert overExpression[0].shape==(len(rankedGeneIndices),), "Calculated over-expression ratio along incorrect axis"

        logging.debug("Included features determined, building DataFrame in PandasInformativeGenes")
        return pd.DataFrame(
            data = np.vstack((
                    D.includeFeatures[rankedGeneIndices],
                    D.gains[rankedGeneIndices],
                    D.mdlpCriteria[rankedGeneIndices],
                    D.bestThresholds[rankedGeneIndices],
                    np.vstack(tuple(overExpression))
                )).T,
            index=self.genes[rankedGeneIndices],
            columns=['Informative', 'Gain', 'MDLP-Criterion', 'Threshold'] + ["OverExpressed-%s" % c for c in uniqueClasses],
            dtype='float',
            copy=True,
        )

[docs]class CuffnormReader(object):
    """Class to abstract conversion of cuffnorm output to Pandas DataFrame."""
    @staticmethod
[docs]    def getSpecimens(cuffnormOutputPath):
        logging.info("Fetching cuffnorm output from: %s" % cuffnormOutputPath)
        return pd.read_csv(cuffnormOutputPath, delimiter='\t', index_col=0).T

[docs]class CuffnormInformativeGenes(PandasInformativeGenes, CuffnormReader):
    """Convenience wrapper for :py:class:`PandasInformativeGenes` to automatically load data from cuffnorm gene and FPKM counts.

    Args:
        cuffnormOutputPath (string): path to cuffnorm output
        classes (numpy.ndarray): shape (n,) defining categorical designation of each specimen present in cuffnormOutputPath
    """
    def __init__(self, cuffnormOutputPath, classes):
        specimens = self.getSpecimens(cuffnormOutputPath)
        logging.debug("CuffnormOveVsRest handing over to parent class PandasInformativeGenes")
        super(CuffnormInformativeGenes, self).__init__(specimens, classes)

[docs]class PandasMultiClass(PandasInformativeGenes):
    """Convenience wrapper to run :py:class:`PandasInformativeGenes` analyses on multi-class data by running c different one-vs-rest analyses, where c is the total number of classes.

    Args:
        specimens (pandas.DataFrame): shape (n,p) where the n index values constitute the specimens and the p columns the genes.
        multiClasses (list): length (n) class designation for each of the n specimens; can include any type that may be used as a dict key.
    """
    def __init__(self, specimens, multiClasses):
        self.multiClasses = multiClasses
        super(PandasMultiClass, self).__init__(specimens, multiClasses) # will modify classes for each run if doing oneVsRest, but need something in there for now and this will check the size of the array

[docs]    def informativeGenes(self, allGenes=False):
        """See :py:class:`bnol.workflows.PandasInformativeGenes.informativeGenes` for base behaviour, repeated for each class.

        Args:
            allGenes (bool): as it says on the box if True else only return those genes for which the entropy gain is greater than the MDLP criterion.

        Returns:
            dict: pandas.DataFrame: each entry being a pandas.DataFrame object returned by one-vs-rest analysis.
        """
        uniqueClasses = np.unique(self.multiClasses)
        allComparisons = dict()
        logging.info("Performing one-vs-rest multi-class comparison of %d classes in PandasMultiClass" % len(uniqueClasses))
        for c in uniqueClasses:
            logging.debug("Starting analysis of class: %s" % c)
            self.classes = np.asarray(self._getBinaryClasses(self.multiClasses, c))
            allComparisons[c] = super(PandasMultiClass, self).informativeGenes(allGenes)
            logging.debug("Finished analysis of class: %s" % c)
        return allComparisons

    @staticmethod
    def _getBinaryClasses(multiClasses, trueClass):
        notTrueClass = "not-%s" % trueClass
        return [(c if c==trueClass else notTrueClass) for c in multiClasses]

[docs]class CuffnormMultiClass(PandasMultiClass, CuffnormReader):
    """Convenience wrapper for :py:class:`PandasMultiClass` to automatically load data from cuffnorm gene and FPKM counts.

    Args:
        cuffnormOutputPath (string): path to cuffnorm output
        multiClasses (list): length (n) class designation for each of the n specimens present in cuffnormOutputPath; can include any type that may be used as a dict key.
    """
    def __init__(self, cuffnormOutputPath, multiClasses):
        specimens = self.getSpecimens(cuffnormOutputPath)
        super(CuffnormMultiClass, self).__init__(specimens, multiClasses)

ensemblFormats = {'full', 'condensed'} # set {} has faster lookup than list []
[docs]def EnsemblLookup(ensemblIDs, lookupFormat='full', rebuildCache=False):
    """Use the Ensembl REST API 'lookup' to return data corresponding to a particular ID. Will create a local cache.

    http://rest.ensembl.org/documentation/info/lookup

    Args:
        ensemblIDs (list or string): Ensembl IDs in a list; will accept a string.
        lookupFormat (string): One of 'full' or 'condensed' as described in the API documentation.
        rebuildCache (bool): If True, fetch fresh data from Ensembl even if a locally-cached version exists.

    Returns:
        list or dict: The JSON data returned by the API, converted to a dict. Will return a single dict if string ID passed.

    Raises:
        Exception: if an invalid format string is passed.
    """
    if not lookupFormat in ensemblFormats:
        raise Exception("Ensembl lookup can only be 'full' or 'condsensed'")

    if isinstance(ensemblIDs, str):
        ensemblIDs = [ensemblIDs]
        returnAsList = False
    else:
        returnAsList = True
        ensemblIDs = ensemblIDs

    memo = shelve.open("./.ensemblCache-%d" % sys.version_info[0]) # technically don't need different file names, but local automated testing fails when using python3 to open python2 shelves

    getFresh = [rebuildCache or (not idx in memo) or (memo[idx] is None) for idx in ensemblIDs]
    urls = ["https://rest.ensembl.org/lookup/id/%s?content-type=application/json;format=%s" % (idx, lookupFormat) for (i, idx) in enumerate(ensemblIDs) if getFresh[i]]
    rs = (grequests.get(u) for u in urls)
    freshData = deque(grequests.map(rs))

    data = []
    for i, idx in enumerate(ensemblIDs):
        if getFresh[i]:
            memo[idx] = freshData.popleft() # freshData queue is only populated with those we know are not in the memo (or all if rebuildCache)
        data.append(memo[idx])

    memo.close()

    def returnValue(d):
        return d.json() if d is not None else dict()

    if returnAsList:
        return [returnValue(d) for d in data]
    else:
        return returnValue(data[0])