Source code for bnol.workflows
import sys
import numpy as np, pandas as pd, grequests, shelve
import logging
from collections import deque
from . import information
[docs]class PandasInformativeGenes(object):
"""Binary comparison of sub-classes of specimens. In the case of multiple specimen classes, expected use is in a one-vs-rest fashion.
Provides standardized access to means of ranking genes e.g. through entropy improvement via :py:class:`~bnol.information.Discretize`.
Args:
specimens (pandas.DataFrame): shape (n,p) where the n index values constitute the specimens and the p columns the genes.
classes (numpy.ndarray): shape (n,) class designation for each of the n specimens.
Raises:
Exception: if there is not exactly one class designation for each specimen.
"""
def __init__(self, specimens, classes):
self.specimens = np.asarray(specimens)
self.classes = classes
if not self.specimens.shape[0]==len(self.classes):
raise Exception("A single class designation must be provided for each specimen.")
self.genes = specimens.columns.values
self.specimenLabels = specimens.index
self.Discrete = information.Discretize()
logging.debug("PandasInformativeGenes initialized successfully")
def _classOverExpressionRatio(self, discretizedFeatures, overExpressionClass):
"""Convenience wrapper for the mean number of times that each gene is over-expressed, i.e. marked True, in the subset of genes passed defined by booleanClasses==True."""
inClass = [discretizedFeatures[idx] for (idx, c) in enumerate(self.classes) if c==overExpressionClass]
return np.mean(inClass, axis=0)
[docs] def informativeGenes(self, allGenes=False):
"""Determine genes considered to be informative by means of decreasing class entropy through. Convenience wrapper for :py:class:`~bnol.information.Discretize`.
Will additionally provide an optimal threshold for determining over- vs under-expression between the two classes.
The proportion of specimens in each class, considered over-expressed by this threshold, will also be provided.
Args:
allGenes (bool): as it says on the box if True else only return those genes for which the entropy gain is greater than the MDLP criterion.
Returns:
pandas.DataFrame: details regarding genes, ranked in descending order of amount for which entropy gain exceeds MDLP criterion.
"""
D = self.Discrete # convenience
logging.debug("Starting feature discretization in PandasInformativeGenes")
D.fit(self.specimens, self.classes)
logging.debug("Finished feature discretization in PandasInformativeGenes")
# determine a set of gene indices, ranked by decreasing entropy gain above MDLP criterion for said genes
# the approach is not optimal in that we sort all p genes and then take a subset rather than taking the subset first
# however, this is only run once, is very quick when tested with 22k+ genes and makes the code easier to read
# simply sort and then take first p` where p` = p if allGenes else the total number of genes that are informative
numGenesToInclude = len(self.genes) if allGenes else np.sum(self.Discrete.includeFeatures)
gainAboveMDLP = self.Discrete.gains - self.Discrete.mdlpCriteria
rankedGeneIndices = list(reversed(np.argsort(gainAboveMDLP)))[:numGenesToInclude]
# for each of gene classes, calculate the proportion of specimens that show over-expression relative to the threshold
(__, indices) = np.unique(self.classes, return_index=True)
indices.sort()
uniqueClasses = [self.classes[idx] for idx in indices] # note that this maintains the order as seen in the original set of classes
orderedFeatures = D.discretizedFeatures[:,rankedGeneIndices]
overExpression = [self._classOverExpressionRatio(orderedFeatures, c) for c in uniqueClasses]
assert overExpression[0].shape==(len(rankedGeneIndices),), "Calculated over-expression ratio along incorrect axis"
logging.debug("Included features determined, building DataFrame in PandasInformativeGenes")
return pd.DataFrame(
data = np.vstack((
D.includeFeatures[rankedGeneIndices],
D.gains[rankedGeneIndices],
D.mdlpCriteria[rankedGeneIndices],
D.bestThresholds[rankedGeneIndices],
np.vstack(tuple(overExpression))
)).T,
index=self.genes[rankedGeneIndices],
columns=['Informative', 'Gain', 'MDLP-Criterion', 'Threshold'] + ["OverExpressed-%s" % c for c in uniqueClasses],
dtype='float',
copy=True,
)
[docs]class CuffnormReader(object):
"""Class to abstract conversion of cuffnorm output to Pandas DataFrame."""
@staticmethod
[docs] def getSpecimens(cuffnormOutputPath):
logging.info("Fetching cuffnorm output from: %s" % cuffnormOutputPath)
return pd.read_csv(cuffnormOutputPath, delimiter='\t', index_col=0).T
[docs]class CuffnormInformativeGenes(PandasInformativeGenes, CuffnormReader):
"""Convenience wrapper for :py:class:`PandasInformativeGenes` to automatically load data from cuffnorm gene and FPKM counts.
Args:
cuffnormOutputPath (string): path to cuffnorm output
classes (numpy.ndarray): shape (n,) defining categorical designation of each specimen present in cuffnormOutputPath
"""
def __init__(self, cuffnormOutputPath, classes):
specimens = self.getSpecimens(cuffnormOutputPath)
logging.debug("CuffnormOveVsRest handing over to parent class PandasInformativeGenes")
super(CuffnormInformativeGenes, self).__init__(specimens, classes)
[docs]class PandasMultiClass(PandasInformativeGenes):
"""Convenience wrapper to run :py:class:`PandasInformativeGenes` analyses on multi-class data by running c different one-vs-rest analyses, where c is the total number of classes.
Args:
specimens (pandas.DataFrame): shape (n,p) where the n index values constitute the specimens and the p columns the genes.
multiClasses (list): length (n) class designation for each of the n specimens; can include any type that may be used as a dict key.
"""
def __init__(self, specimens, multiClasses):
self.multiClasses = multiClasses
super(PandasMultiClass, self).__init__(specimens, multiClasses) # will modify classes for each run if doing oneVsRest, but need something in there for now and this will check the size of the array
[docs] def informativeGenes(self, allGenes=False):
"""See :py:class:`bnol.workflows.PandasInformativeGenes.informativeGenes` for base behaviour, repeated for each class.
Args:
allGenes (bool): as it says on the box if True else only return those genes for which the entropy gain is greater than the MDLP criterion.
Returns:
dict: pandas.DataFrame: each entry being a pandas.DataFrame object returned by one-vs-rest analysis.
"""
uniqueClasses = np.unique(self.multiClasses)
allComparisons = dict()
logging.info("Performing one-vs-rest multi-class comparison of %d classes in PandasMultiClass" % len(uniqueClasses))
for c in uniqueClasses:
logging.debug("Starting analysis of class: %s" % c)
self.classes = np.asarray(self._getBinaryClasses(self.multiClasses, c))
allComparisons[c] = super(PandasMultiClass, self).informativeGenes(allGenes)
logging.debug("Finished analysis of class: %s" % c)
return allComparisons
@staticmethod
def _getBinaryClasses(multiClasses, trueClass):
notTrueClass = "not-%s" % trueClass
return [(c if c==trueClass else notTrueClass) for c in multiClasses]
[docs]class CuffnormMultiClass(PandasMultiClass, CuffnormReader):
"""Convenience wrapper for :py:class:`PandasMultiClass` to automatically load data from cuffnorm gene and FPKM counts.
Args:
cuffnormOutputPath (string): path to cuffnorm output
multiClasses (list): length (n) class designation for each of the n specimens present in cuffnormOutputPath; can include any type that may be used as a dict key.
"""
def __init__(self, cuffnormOutputPath, multiClasses):
specimens = self.getSpecimens(cuffnormOutputPath)
super(CuffnormMultiClass, self).__init__(specimens, multiClasses)
ensemblFormats = {'full', 'condensed'} # set {} has faster lookup than list []
[docs]def EnsemblLookup(ensemblIDs, lookupFormat='full', rebuildCache=False):
"""Use the Ensembl REST API 'lookup' to return data corresponding to a particular ID. Will create a local cache.
http://rest.ensembl.org/documentation/info/lookup
Args:
ensemblIDs (list or string): Ensembl IDs in a list; will accept a string.
lookupFormat (string): One of 'full' or 'condensed' as described in the API documentation.
rebuildCache (bool): If True, fetch fresh data from Ensembl even if a locally-cached version exists.
Returns:
list or dict: The JSON data returned by the API, converted to a dict. Will return a single dict if string ID passed.
Raises:
Exception: if an invalid format string is passed.
"""
if not lookupFormat in ensemblFormats:
raise Exception("Ensembl lookup can only be 'full' or 'condsensed'")
if isinstance(ensemblIDs, str):
ensemblIDs = [ensemblIDs]
returnAsList = False
else:
returnAsList = True
ensemblIDs = ensemblIDs
memo = shelve.open("./.ensemblCache-%d" % sys.version_info[0]) # technically don't need different file names, but local automated testing fails when using python3 to open python2 shelves
getFresh = [rebuildCache or (not idx in memo) or (memo[idx] is None) for idx in ensemblIDs]
urls = ["https://rest.ensembl.org/lookup/id/%s?content-type=application/json;format=%s" % (idx, lookupFormat) for (i, idx) in enumerate(ensemblIDs) if getFresh[i]]
rs = (grequests.get(u) for u in urls)
freshData = deque(grequests.map(rs))
data = []
for i, idx in enumerate(ensemblIDs):
if getFresh[i]:
memo[idx] = freshData.popleft() # freshData queue is only populated with those we know are not in the memo (or all if rebuildCache)
data.append(memo[idx])
memo.close()
def returnValue(d):
return d.json() if d is not None else dict()
if returnAsList:
return [returnValue(d) for d in data]
else:
return returnValue(data[0])