# Source code for cr.cube.crunch_cube

'''Home of the CrunchCube class.

This module contains the definition of the CrunchCube class. It represents
the open-source library used for manipulating the crunch cubes (JSON responses
from the Crunch.io platform).
'''
from __future__ import division
import json

import numpy as np

from cr.cube.dimension import Dimension


class CrunchCube(object):
    '''Implementation of the CrunchCube API class.

    Class is used for the implementation of the main API functions that are
    needed for seamless integration with the crunch cube responses (from
    the Crunch.io platform). Main API functions are:

        - as_array
        - margin
        - proportions
        - percentages

    These functions are used to retrieve statistical information of
    interest from the JSON-like crunch cubes. Complete usage of each API
    function is described within the appropriate docstring.

    Crunch Cubes contain richer metadata than standard Python objects, and
    they also conceal certain complexity in the data structures from the
    user. In particular, Multiple Response variables are generally
    represented as single dimensions in result tables, but in the actual
    data they may comprise two dimensions. These methods (API) understand
    the subtleties in the Crunch data types, and correctly compute margins
    and percentages off of them.
    '''

    def __init__(self, response):
        '''Initializes the CrunchCube class with the cube JSON response.

        Class can be initialized with both JSON string and dict types.
        There's no real parsing of the data at the point of initialization,
        just storing. The functionality is implemented in the appropriate
        API functions.

        Args
            response (str or dict): Cube response; a `str` is decoded with
                `json.loads` first.
        Raises
            TypeError: If the (possibly decoded) response is not a `dict`.
            KeyError: If the response has no 'value' entry.
        '''
        # @cube: Represents the cube response object, as generated by
        # the zz9d cube factory.

        # Cube can come in as a JSON or as a dictionary, so we need to check
        # its type, and convert it to a dictionary if it's JSON, if possible.
        if isinstance(response, str):
            response = json.loads(response)

        # If conversion to dict is not possible, or an unexpected type is
        # provided, fail sooner rather than later.
        if not isinstance(response, dict):
            raise TypeError((
                'Unsupported type provided: {}. '
                'A `cube` must be JSON or `dict`.'
            ).format(type(response)))

        self._cube = response['value']

    @classmethod
    def _get_dimensions(cls, cube):
        '''Gets the dimensions of the crunch cube.

        This function is internal, and is not meant to be used by outside
        users of the CrunchCube class. The main reason for this is the
        internal representation of the different variable types (namely the
        MR and the CA). These types have two dimensions each, but in the
        case of MR, the second dimension shouldn't be visible to the user.
        This function returns such dimensions as well, since they're
        necessary for the correct implementation of the functionality for
        the MR type. The version that is meant to be used by users is the
        property 'dimensions'.

        Args
            cube (dict): Cube payload; read at cube['result']['dimensions']
        Returns
            (list): One Dimension object per entry, in cube order
        '''
        entries = cube['result']['dimensions']
        last = len(entries) - 1
        dimensions = []
        for i, entry in enumerate(entries):
            # Multiple Response and Categorical Array variables have
            # two subsequent dimensions (elements and selections). For
            # this reason it's necessary to pass in both of them in the
            # Dimension class init method. This is needed in order to
            # determine the correct type (CA or MR). We only skip the
            # two-argument constructor for the last dimension in the list
            # (where it's not possible to fetch the subsequent one).
            if i >= last:
                dimensions.append(Dimension(entry))
            else:
                dimensions.append(Dimension(entry, entries[i + 1]))
        return dimensions

    @classmethod
    def _get_mr_selections_indices(cls, dimensions):
        '''Gets indices of each 'selection' dim, for corresponding MR dim.

        MR and CA variables are represented by two dimensions each. These
        dimensions can be thought of as 'elements' and 'selections'. This
        function returns the indices of the 'selections' dimension for each
        MR variable.

        Args
            dimensions (list): All dimensions, as returned by
                `_get_dimensions`
        Returns
            (list of int): Index of the dimension that follows each
                'multiple_response' dimension
        '''
        count = len(dimensions)
        # The 'selections' dimension follows right after its MR dimension
        # (in the originating cube), hence the `i + 1`. An MR dimension in
        # the very last position has no follower and is skipped.
        return [
            i + 1
            for (i, dim) in enumerate(dimensions)
            if i + 1 < count and dim.type == 'multiple_response'
        ]

    @classmethod
    def _get_valid_indices(cls, dimensions, include_missing,
                           get_non_selected):
        '''Gets valid indices for each dimension.

        Main criterion for a valid index is most often the information
        about whether the corresponding value of the dimension is missing
        or not. For MR variables, since they use two dimensions, the valid
        index for the 'selections' dimension is [0], except in the case of
        non-selected slices calculation, where it needs to be [1].

        Args
            dimensions (list): All dimensions of the cube
            include_missing (bool): Passed to each dim's `valid_indices`
            get_non_selected (bool): Pick slice [1] instead of [0] for the
                MR 'selections' dimensions
        Returns
            (list): One index collection per dimension
        '''
        valid_indices = [
            dim.valid_indices(include_missing) for dim in dimensions
        ]
        mr_slice = [1] if get_non_selected else [0]
        # In the case of MR variables, we only need to select a single
        # slice of the 'selections' dimension.
        for i in cls._get_mr_selections_indices(dimensions):
            valid_indices[i] = mr_slice
        return valid_indices

    @classmethod
    def _fix_shape(cls, array):
        '''Fixes shape of MR variables.

        For MR variables, where 'selections' dims are dropped, the ndarray
        needs to be reshaped, in order to seem as if those dims never
        existed.

        Args
            array (ndarray): Array to reshape
        Returns
            (ndarray): Same data with every length-1 axis removed
        '''
        # NOTE(review): this drops *every* axis of length 1, not only the
        # MR 'selections' axes, so a regular dimension with a single valid
        # element is removed as well. Behaviour is kept as is here.
        new_shape = [dim for dim in array.shape if dim != 1]
        return array.reshape(new_shape)

    def _non_weighted_counts(self):
        '''Placeholder; currently does nothing and returns None.'''
        pass

    def _as_array(self, include_missing=False, get_non_selected=False,
                  weighted=True):
        '''Get crunch cube as ndarray.

        Args
            include_missing (bool): Include rows/cols for missing values
            get_non_selected (bool): Get non-selected slices for MR vars
            weighted (bool): Read counts from the 'count' measure when
                True, from the result's 'counts' entry otherwise
        Returns
            res (ndarray): Tabular representation of crunch cube
        '''
        result = self._cube['result']
        counts = (
            result['measures']['count']['data']
            if weighted else
            result['counts']
        )
        all_dimensions = self._get_dimensions(self._cube)
        shape = [len(dim.elements) for dim in all_dimensions]
        valid_indices = self._get_valid_indices(
            all_dimensions,
            include_missing,
            get_non_selected
        )
        # The flat counts are laid out according to the full element count
        # of every dimension; np.ix_ then keeps only the valid entries.
        res = np.array(counts).reshape(shape)[np.ix_(*valid_indices)]
        return self._fix_shape(res)

    @classmethod
    def _divide_by_margin(cls, array, margin, axis):
        '''Divides `array` by `margin`, aligning the margin with `axis`.

        When the margin was obtained by summing `array` along `axis`, it
        has one dimension fewer than `array`. The reduced axis is put back
        (with length 1) so that broadcasting divides every element by the
        margin it actually belongs to, for any axis, including negative
        ones.

        Args
            array (ndarray): Values to be divided
            margin (ndarray or scalar): Margin to divide by
            axis (int or None): Axis the margin was calculated across
        Returns
            (ndarray): Element-wise quotient
        '''
        if axis is not None and np.ndim(margin) == np.ndim(array) - 1:
            margin = np.expand_dims(margin, axis)
        elif axis == 1:
            # Margin was not reduced along the axis (this is what `margin`
            # returns for MR cubes); keep the historical handling.
            margin = margin[:, np.newaxis]
        return array / margin

    # API Functions

    def labels(self, include_missing=False):
        '''Gets labels for each cube's dimension.

        Args
            include_missing (bool): Include labels for missing values
        Returns
            labels (list of lists): Labels for each dimension
        '''
        return [dim.labels(include_missing) for dim in self.dimensions]

    @property
    def dimensions(self):
        '''Dimensions of the crunch cube.

        The MR 'selections' dimensions are left out.
        '''
        all_dimensions = self._get_dimensions(self._cube)
        mr_selections = self._get_mr_selections_indices(all_dimensions)
        return [
            dim for (i, dim) in enumerate(all_dimensions)
            if i not in mr_selections
        ]

    def as_array(self, include_missing=False, weighted=True):
        '''Get crunch cube as ndarray.

        Returns the tabular representation of the crunch cube. The
        returning value has as many dimensions as there are dimensions in
        the crunch cube itself. E.g. for a cross-tab representation of a
        categorical and numerical variable, the resulting cube will have
        two dimensions.

        Args
            include_missing (bool): Include rows/cols for missing values
            weighted (bool): Use weighted counts (see `_as_array`)
        Returns
            (ndarray): Tabular representation of the crunch cube

        Example 1 (Categorical x Categorical):
            >>> cube = CrunchCube(response)
            >>> cube.as_array()
            np.array([
                [5, 2],
                [5, 3],
            ])

        Example 2 (Categorical x Categorical, include missing values):
            >>> cube = CrunchCube(response)
            >>> cube.as_array(include_missing=True)
            np.array([
                [5, 3, 2, 0],
                [5, 2, 3, 0],
                [0, 0, 0, 0],
            ])
        '''
        return self._as_array(
            include_missing=include_missing,
            weighted=weighted
        )

    def margin(self, axis=None, weighted=True):
        '''Get margin for the selected axis.

        For cubes without MR variables, the margin is the sum of the cube
        values across the selected axis. For MR variables, it is the
        element-wise sum of the selected and non-selected slices; in that
        case `axis` is not used for the reduction, except that for
        `axis=None` a result with more than one dimension is summed along
        axis 0.

        Args
            axis (int): Axis across which the margin is calculated. If no
                axis is provided the margin is calculated across all axes.
                For Categoricals, Num, Datetime, and Text, this translates
                to summation of all elements.
            weighted (bool): Use weighted counts (see `_as_array`)
        Returns
            Calculated margin for the selected axis

        Example 1:
            >>> cube = CrunchCube(fixt_cat_x_cat)
            >>> cube.as_array()
            np.array([
                [5, 2],
                [5, 3],
            ])
            >>> cube.margin(axis=0)
            np.array([10, 5])

        Example 2:
            >>> cube = CrunchCube(fixt_cat_x_num_x_datetime)
            >>> cube.as_array()
            np.array([
                [[1, 1], [0, 0], [0, 0], [0, 0]],
                [[2, 1], [1, 1], [0, 0], [0, 0]],
                [[0, 0], [2, 3], [0, 0], [0, 0]],
                [[0, 0], [0, 0], [3, 2], [0, 0]],
                [[0, 0], [0, 0], [1, 1], [0, 1]]
            ])
            >>> cube.margin(axis=0)
            np.array([
                [3, 2],
                [3, 4],
                [4, 3],
                [0, 1],
            ])
        '''
        array = self.as_array(weighted=weighted)
        all_dimensions = self._get_dimensions(self._cube)

        if not self._get_mr_selections_indices(all_dimensions):
            return np.sum(array, axis)

        # MR present: selected + non-selected slices.
        margin = array + self._as_array(
            get_non_selected=True, weighted=weighted
        )
        if axis is None and len(margin.shape) > 1:
            return np.sum(margin, 0)
        return margin

    def proportions(self, axis=None):
        '''Get proportions of a crunch cube.

        This function calculates the proportions across the selected axis
        of a crunch cube. For most variable types, it means the value
        divided by the margin value. For Multiple Response types, the value
        is divided by the sum of selected and non-selected slices.

        Args
            axis (int): Base axis of proportions calculation. If no axis is
                provided, calculations are done across the entire table.
        Returns
            (nparray): Calculated array of crunch cube proportions.

        Example 1:
            >>> cube = CrunchCube(fixt_cat_x_cat)
            >>> cube.as_array()
            np.array([
                [5, 2],
                [5, 3],
            ])
            >>> cube.proportions()
            np.array([
                [0.3333333, 0.1333333],
                [0.3333333, 0.2000000],
            ])

        Example 2:
            >>> cube = CrunchCube(fixt_cat_x_cat)
            >>> cube.as_array()
            np.array([
                [5, 2],
                [5, 3],
            ])
            >>> cube.proportions(axis=0)
            np.array([
                [0.5, 0.4],
                [0.5, 0.6],
            ])
        '''
        margin = self.margin(axis)
        return self._divide_by_margin(self.as_array(), margin, axis)

    def percentages(self, axis=None):
        '''Get the percentages for crunch cube values.

        This function calculates the percentages for crunch cube values.
        The percentages are the values of 'proportions' multiplied by 100.

        Args
            axis (int): Base axis of percentages calculation. If no axis is
                provided, calculations are done across the entire table.
        Returns
            (nparray): Calculated array of crunch cube percentages.

        Example 1:
            >>> cube = CrunchCube(fixt_cat_x_cat)
            >>> cube.as_array()
            np.array([
                [5, 2],
                [5, 3],
            ])
            >>> cube.percentages()
            np.array([
                [33.33333, 13.33333],
                [33.33333, 20.00000],
            ])

        Example 2:
            >>> cube = CrunchCube(fixt_cat_x_cat)
            >>> cube.as_array()
            np.array([
                [5, 2],
                [5, 3],
            ])
            >>> cube.percentages(axis=0)
            np.array([
                [50., 40.],
                [50., 60.],
            ])
        '''
        return self.proportions(axis) * 100