# Source code for quail.analysis.analysis

#!/usr/bin/env python
from __future__ import division
from __future__ import print_function
from builtins import zip
from builtins import range
import six
import numpy as np
import pandas as pd
import warnings
import six
from ..helpers import *
from ..distance import dist_funcs as dist_funcs_dict
from .recmat import recall_matrix
from .accuracy import accuracy_helper
from .spc import spc_helper
from .pnr import pnr_helper
from .lagcrp import lagcrp_helper
from .clustering import fingerprint_helper

# Dispatch table: maps every analysis name accepted by `analyze` to the helper
# function that computes it.  Two pairs of names share a helper: 'pfr' and
# 'pnr' both resolve to pnr_helper, and 'fingerprint' and 'temporal' both
# resolve to fingerprint_helper.
analyses = dict(
    accuracy=accuracy_helper,
    spc=spc_helper,
    pfr=pnr_helper,
    pnr=pnr_helper,
    lagcrp=lagcrp_helper,
    fingerprint=fingerprint_helper,
    temporal=fingerprint_helper,
)

# main analysis function

# [docs]
def analyze(egg, subjgroup=None, listgroup=None, subjname='Subject',
            listname='List', analysis=None, position=0, permute=False,
            n_perms=1000, parallel=False, match='exact',
            distance='euclidean', features=None, ts=None, n_jobs=-1):
    """
    General analysis function that groups data by subject/list number and performs analysis.

    If the egg carries non-None ``subjgroup``, ``subjname``, ``listgroup`` or
    ``listname`` attributes, those values override the corresponding
    arguments passed to this function.

    Parameters
    ----------
    egg : Egg data object
        The data to be analyzed

    subjgroup : list of strings or ints
        String/int variables indicating how to group over subjects.  Must be
        the length of the number of subjects

    subjname : string
        Name of the subject grouping variable

    listgroup : list of strings or ints
        String/int variables indicating how to group over list.  Must be
        the length of the number of lists

    listname : string
        Name of the list grouping variable

    analysis : string
        This is the analysis you want to run.  Can be accuracy, spc, pfr,
        pnr, lagcrp, temporal or fingerprint

    position : int
        Optional argument for pnr analysis.  Defines encoding position of item
        to run pnr.  Default is 0, and it is zero indexed.  For the pfr
        analysis the position handed to the helper is always 0.

    permute : bool
        Optional argument for fingerprint/temporal cluster analyses. Determines
        whether to correct clustering scores by shuffling recall order for each list
        to create a distribution of clustering scores (for each feature). The
        "corrected" clustering score is the proportion of clustering scores in
        that random distribution that were lower than the clustering score for
        the observed recall sequence. Default is False.

    n_perms : int
        Optional argument for fingerprint/temporal cluster analyses. Number of
        permutations to run for "corrected" clustering scores. Default is 1000 (
        per recall list).

    parallel : bool
        Option to use multiprocessing (this can help speed up the permutations
        tests in the clustering calculations)

    match : str (exact, best or smooth)
        Matching approach to compute recall matrix.  If exact, the presented and
        recalled items must be identical (default).  If best, the recalled item
        that is most similar to the presented items will be selected. If smooth,
        a weighted average of all presented items will be used, where the
        weights are derived from the similarity between the recalled item and
        each presented item.

    distance : str
        The distance function used to compare presented and recalled items.
        Applies only to 'best' and 'smooth' matching approaches.  Can be any
        distance function supported by scipy.spatial.distance.cdist.

    features : list of strings
        Features handed to the analysis helper.  Defaults to
        ``egg.feature_names``.  For the temporal analysis this is replaced by
        ``['Temporal']``.

    ts : int
        Optional argument for lagcrp analysis; it is only forwarded when
        ``analysis='lagcrp'``.  When set (non-zero), the result columns span
        lags ``-ts`` to ``ts``; otherwise they span ``-list_length`` to
        ``list_length``.

    n_jobs : int
        Number of parallel jobs for fingerprint analysis. Default is -1 (all cores).
        Only used for fingerprint/temporal analyses when joblib is available.


    Returns
    ----------
    result : quail.FriedEgg
        Class instance containing the analysis results

    Raises
    ----------
    ValueError
        If ``analysis`` is None or is not one of the recognized analysis names.

    """
    if analysis is None:
        raise ValueError('You must pass an analysis type.')

    if analysis not in analyses:
        # Build the list from the dispatch table so that the message can never
        # drift out of sync with the analyses that are actually supported.
        raise ValueError('Analysis not recognized. Choose one of the following: '
                         + ', '.join(analyses))

    # Imported here rather than at module level (kept as in the original;
    # presumably to avoid a circular import with the egg module).
    from ..egg import FriedEgg

    # Grouping metadata stored on the egg takes precedence over the arguments.
    if getattr(egg, 'subjgroup', None) is not None:
        subjgroup = egg.subjgroup

    if getattr(egg, 'subjname', None) is not None:
        subjname = egg.subjname

    if getattr(egg, 'listgroup', None) is not None:
        listgroup = egg.listgroup

    if getattr(egg, 'listname', None) is not None:
        listname = egg.listname

    if features is None:
        features = egg.feature_names

    opts = {
        'subjgroup' : subjgroup,
        'listgroup' : listgroup,
        'subjname' : subjname,
        # listname must be forwarded too; otherwise the list level of the
        # result index is always labelled with the default 'List'.
        'listname' : listname,
        'parallel' : parallel,
        'match' : match,
        'distance' : distance,
        'features' : features,
        'analysis_type' : analysis,
        'analysis' : analyses[analysis]
    }

    # Analysis-specific options: only the helpers that understand a given
    # keyword receive it.
    if analysis == 'pfr':
        opts.update({'position' : 0})
    elif analysis == 'pnr':
        opts.update({'position' : position})
    if analysis == 'temporal':
        opts.update({'features' : ['Temporal']})
    if analysis in ['temporal', 'fingerprint']:
        opts.update({'permute' : permute, 'n_perms' : n_perms, 'n_jobs' : n_jobs})
    if analysis == 'lagcrp':
        opts.update({'ts' : ts})

    return FriedEgg(data=_analyze_chunk(egg, **opts), analysis=analysis,
                    list_length=egg.list_length, n_lists=egg.n_lists,
                    n_subjects=egg.n_subjects, position=position)


def _analyze_chunk(data, subjgroup=None, subjname='Subject', listgroup=None,
                   listname='List', analysis=None, analysis_type=None,
                   pass_features=False, features=None, parallel=False,
                   **kwargs):
    """
    Private function that groups data by subject/list number and performs
    analysis for a chunk of data.

    Parameters
    ----------
    data : Egg data object
        The data to be analyzed

    subjgroup : list of strings or ints
        String/int variables indicating how to group over subjects.  Must be
        the length of the number of subjects.  When None or empty, every
        subject forms its own group.

    subjname : string
        Name of the subject grouping variable

    listgroup : list of strings or ints, or list of such lists
        String/int variables indicating how to group over list.  Must be
        the length of the number of lists.  When None or empty, every list
        forms its own group.  If every element is itself a list, the value is
        treated as one list grouping per subject (in subject order); the
        grouping of the first subject in each subject group is used for the
        whole group.

    listname : string
        Name of the list grouping variable

    analysis : function
        This function analyzes data and returns it.  It is called as
        ``analysis(cracked_egg, features=features, **kwargs)``.

    analysis_type : string
        Name of the analysis.  Only used to choose the result columns:
        ``features`` for 'fingerprint' and a range of lags for 'lagcrp'.

    pass_features : bool
        Logical indicating whether the analyses uses the features field of the
        Egg.  Currently unused; kept so existing callers keep working.

    features : list of strings
        Features forwarded to ``analysis``.

    parallel : bool
        If True, the (subject group, list group) chunks are analyzed in a
        pathos process pool with one worker per CPU.

    **kwargs
        Extra keyword arguments forwarded unchanged to ``analysis``.

    Returns
    ----------
    analyzed_data : Pandas DataFrame
        DataFrame containing the analysis results

    """
    subj_ids = data.pres.index.levels[0].values
    list_ids = data.pres.index.levels[1].values

    # Test for None/empty explicitly: truth-testing a multi-element numpy
    # array raises ValueError, so ``x if x else default`` breaks on arrays.
    if subjgroup is None or len(subjgroup) == 0:
        subjgroup = subj_ids
    if listgroup is None or len(listgroup) == 0:
        listgroup = list_ids

    # For every subject group record both the subject ids that belong to it
    # and their positions within the subject level of the index.
    subjgroup_arr = np.array(subjgroup)
    subjdict = {}
    subj_positions = {}
    for subj in set(subjgroup):
        mask = subj == subjgroup_arr
        subjdict[subj] = subj_ids[mask]
        subj_positions[subj] = np.flatnonzero(mask)

    if all(isinstance(el, list) for el in listgroup):
        # Per-subject listgroup: one list of labels per subject, in subject order.
        per_subject_listdict = []
        for listgrpsub in listgroup:
            labels = np.array(listgrpsub)
            per_subject_listdict.append(
                {lst : list_ids[lst == labels] for lst in set(listgrpsub)})

        listdict = {}
        for subj_group, positions in subj_positions.items():
            if len(positions) == 0:
                continue
            # Use the first subject's grouping as representative for the group
            # (assuming all subjects in a group share the same list groupings).
            # Look it up by the subject's *position*, not by its id value, so
            # that subject ids that are not 0..n-1 still select the right entry.
            first_pos = positions[0]
            if first_pos < len(per_subject_listdict):
                listdict[subj_group] = per_subject_listdict[first_pos]
            else:
                # Fewer per-subject groupings than subjects: fall back to the first.
                listdict[subj_group] = per_subject_listdict[0]
    else:
        # Shared list grouping: the same mapping applies to every subject group.
        labels = np.array(listgroup)
        shared = {lst : list_ids[lst == labels] for lst in set(listgroup)}
        listdict = {subj : shared for subj in subjdict}

    # perform the analysis for a single (subject group, list group) pair
    def _analysis(c):
        subj, lst = c
        subjects = list(subjdict[subj])
        lists = list(listdict[subj][lst])
        s = data.crack(lists=lists, subjects=subjects)
        index = pd.MultiIndex.from_arrays([[subj],[lst]], names=[subjname, listname])
        opts = dict()
        if analysis_type == 'fingerprint':
            opts.update({'columns' : features})
        elif analysis_type == 'lagcrp':
            # .get(): tolerate a direct call that does not supply ``ts``.
            ts = kwargs.get('ts')
            if ts:
                opts.update({'columns' : range(-ts, ts+1)})
            else:
                opts.update({'columns' : range(-data.list_length,data.list_length+1)})
        return pd.DataFrame([analysis(s, features=features, **kwargs)],
                            index=index, **opts)

    # listdict is always keyed by subject group at this point
    chunks = [(subj, lst) for subj in subjdict for lst in listdict[subj]]

    if parallel:
        import multiprocessing
        from pathos.multiprocessing import ProcessingPool as Pool
        # Create the pool before entering ``try`` so that a failure in the
        # constructor is not masked by an UnboundLocalError in ``finally``.
        p = Pool(multiprocessing.cpu_count())
        try:
            res = p.map(_analysis, chunks)
        finally:
            p.close()
            p.join()
            p.clear()
    else:
        res = [_analysis(c) for c in chunks]

    return pd.concat(res)