from __future__ import division
from __future__ import print_function
from builtins import zip
from builtins import str
from builtins import range
import json
import re
import csv
import pickle
import os
import pandas as pd
import numpy as np
import deepdish as dd
from .egg import Egg, FriedEgg
from .helpers import parse_egg, stack_eggs
try:
from sqlalchemy import create_engine, MetaData, Table
except ImportError:
    # sqlalchemy is optional; it is only needed for loadEL
    pass
def load(filepath, update=True):
"""
    Loads eggs, fried eggs, and example data
Parameters
----------
filepath : str
Location of file
update : bool
If true, updates egg to latest format
Returns
----------
data : quail.Egg or quail.FriedEgg
Data loaded from disk
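
    Examples
    ----------
    A minimal usage sketch (the 'my_data.egg' path is hypothetical)::

        egg = load('example')       # bundled example egg
        egg = load('my_data.egg')   # an egg previously saved to disk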
"""
if filepath == 'automatic' or filepath == 'example':
fpath = os.path.dirname(os.path.abspath(__file__)) + '/data/automatic.egg'
return load_egg(fpath)
elif filepath == 'manual':
fpath = os.path.dirname(os.path.abspath(__file__)) + '/data/manual.egg'
return load_egg(fpath, update=False)
    elif filepath == 'naturalistic':
        fpath = os.path.dirname(os.path.abspath(__file__)) + '/data/naturalistic.egg'
        return load_egg(fpath, update=False)
    elif filepath.split('.')[-1] == 'egg':
        return load_egg(filepath, update=update)
    elif filepath.split('.')[-1] == 'fegg':
        return load_fegg(filepath, update=False)
else:
raise ValueError('Could not load file.')
def load_fegg(filepath, update=True):
"""
    Loads a fried egg (a quail.FriedEgg analysis result) from disk
Parameters
----------
filepath : str
Location of pickled egg
update : bool
If true, updates egg to latest format
Returns
----------
    egg : FriedEgg data object
        A loaded fried egg
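
    Examples
    ----------
    A minimal usage sketch (the path is hypothetical)::

        fegg = load_fegg('my_analysis.fegg')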
"""
try:
egg = FriedEgg(**dd.io.load(filepath))
except ValueError as e:
print(e)
# if error, try loading old format
with open(filepath, 'rb') as f:
egg = pickle.load(f)
if update:
return egg.crack()
else:
return egg
def load_egg(filepath, update=True):
"""
Loads pickled egg
Parameters
----------
filepath : str
Location of pickled egg
update : bool
If true, updates egg to latest format
Returns
----------
egg : Egg data object
A loaded unpickled egg
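
    Examples
    ----------
    A minimal usage sketch (the path is hypothetical)::

        egg = load_egg('my_data.egg')                # load, updating format
        egg = load_egg('my_data.egg', update=False)  # load as-is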
"""
try:
egg = Egg(**dd.io.load(filepath))
    except Exception:
# if error, try loading old format
with open(filepath, 'rb') as f:
egg = pickle.load(f)
    if update:
        if egg.meta:
            # crack returns a new egg, so stash and restore the metadata
            old_meta = egg.meta
            egg = egg.crack()
            egg.meta = old_meta
            return egg
        else:
            return egg.crack()
    else:
        return egg
def loadEL(dbpath=None, recpath=None, remove_subs=None, wordpool=None, groupby=None, experiments=None,
filters=None):
    '''
    Loads sql (.db) files generated by the autoFR experiment and converts
    them into quail Eggs.

    Parameters
    ----------
    dbpath : str or list
        Path(s) to the sqlite db file(s) to load
    recpath : str
        Path to the folder containing the recall transcription files
    remove_subs : list
        Subject ids to exclude from the result
    wordpool : str
        Path to a csv file describing the stimulus wordpool
    groupby : dict
        Dict with an 'exp_version' key used to split subjects into separate
        eggs by experiment version
    experiments : list
        Experiment versions (codeversions) to include
    filters : list
        Additional filter functions applied to the loaded dataframe
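
    Examples
    ----------
    A minimal usage sketch; the paths and version string below are
    hypothetical::

        egg = loadEL(dbpath='data/participants.db',
                     recpath='data/recall/',
                     wordpool='data/wordpool.csv',
                     experiments=['1.0'])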
'''
assert (dbpath is not None), "You must specify a db file or files."
assert (recpath is not None), "You must specify a recall folder."
assert (wordpool is not None), "You must specify a wordpool file."
assert (experiments is not None), "You must specify a list of experiments"
############################################################################
# subfunctions #############################################################
def db2df(db, filter_func=None):
'''
Loads db file and converts to dataframe
'''
db_url = "sqlite:///" + db
table_name = 'turkdemo'
data_column_name = 'datastring'
        # boilerplate sqlalchemy setup
engine = create_engine(db_url)
metadata = MetaData()
metadata.bind = engine
table = Table(table_name, metadata, autoload=True)
# make a query and loop through
s = table.select()
rows = s.execute()
data = []
for row in rows:
data.append(row[data_column_name])
# parse each participant's datastring as json object
# and take the 'data' sub-object
data = [json.loads(part)['data'] for part in data if part is not None]
# remove duplicate subject data for debugXG82XV:debug7XPXQA
# data[110] = data[110][348:]
# insert uniqueid field into trialdata in case it wasn't added
# in experiment:
for part in data:
for record in part:
# print(record)
if type(record['trialdata']) is list:
record['trialdata'] = {record['trialdata'][0]:record['trialdata'][1]}
record['trialdata']['uniqueid'] = record['uniqueid']
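        # each record is assumed to look roughly like this (values are
        # illustrative, not taken from a real session):
        #   {'uniqueid': 'A1B2C3:XYZ789',
        #    'trialdata': {'trial_type': 'single-stim',
        #                  'stimulus': '<div ...>DOG</div>', 'listNumber': 0}}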
# flatten nested list so we just have a list of the trialdata recorded
# each time psiturk.recordTrialData(trialdata) was called.
def isNotNumber(s):
try:
float(s)
return False
except ValueError:
return True
data = [record['trialdata'] for part in data for record in part]
        # filter out fields that we don't want using the isNotNumber function
filtered_data = [{k:v for (k,v) in list(part.items()) if isNotNumber(k)} for part in data]
# Put all subjects' trial data into a dataframe object from the
# 'pandas' python library: one option among many for analysis
data_frame = pd.DataFrame(filtered_data)
data_column_name = 'codeversion'
        # boilerplate sqlalchemy setup
engine = create_engine(db_url)
metadata = MetaData()
metadata.bind = engine
table = Table(table_name, metadata, autoload=True)
# make a query and loop through
s = table.select()
rows = s.execute()
versions = []
version_dict = {}
for row in rows:
version_dict[row[0]]=row[data_column_name]
        # map each row's uniqueid to its experiment codeversion
        data_frame['exp_version'] = data_frame['uniqueid'].map(version_dict)
        if filter_func:
            for filt in filter_func:
                data_frame = filt(data_frame)
return data_frame
    # custom filter to clean the db file: drop rows where the experimenter
    # name (question Q1) is not on the known list; rows whose responses
    # cannot be parsed are kept
    def experimenter_filter(data_frame):
        experimenters = ['kirsten', 'allison', 'allison\nallison', 'marisol',
                         'marisol ', 'marisiol', 'maddy', 'campbell',
                         'campbell field', 'kirsten\nkirsten', 'emily',
                         'bryan', 'armando', 'armando ortiz', 'maddy/lucy',
                         'paxton', 'lucy', 'campbell\ncampbell', 'madison',
                         'darya', 'rachael']
        indexes = []
        for line in data_frame.iterrows():
            delete = False
            try:
                if json.loads(line[1]['responses'])['Q1'].lower() not in experimenters:
                    delete = True
            except Exception:
                pass
            if delete:
                indexes.append(line[0])
        return data_frame.drop(indexes)
    def adaptive_filter(data_frame):
        # NOTE: as written, both branches leave delete as False, so this
        # filter never drops rows; the structure mirrors the other filters
        indexes = []
        for line in data_frame.iterrows():
            delete = False
            try:
                if 'Q2' in json.loads(line[1]['responses']):
                    delete = False
            except Exception:
                pass
            if delete:
                indexes.append(line[0])
        return data_frame.drop(indexes)
    def experiments_filter(data_frame):
        # drop rows whose exp_version is not in the requested experiments
        indexes = []
        for line in data_frame.iterrows():
            delete = False
            try:
                if line[1]['exp_version'] not in experiments:
                    delete = True
            except Exception:
                pass
            if delete:
                indexes.append(line[0])
        return data_frame.drop(indexes)
# this function takes the data frame and returns subject specific data based on the subid variable
def filterData(data_frame,subid):
filtered_stim_data = data_frame[data_frame['stimulus'].notnull() & data_frame['listNumber'].notnull()]
filtered_stim_data = filtered_stim_data[filtered_stim_data['trial_type']=='single-stim']
filtered_stim_data = filtered_stim_data[filtered_stim_data['uniqueid']==subid]
return filtered_stim_data
    def createStimDict(data):
        stimDict = []
        for index, row in data.iterrows():
            word = str(re.findall(r'>(.+)<', row['stimulus'])[0])
            wp_idx = list(wordpool['WORD'].values).index(word)
            try:
                # parse the rgb color and screen location out of the
                # stimulus html, falling back to defaults if absent
                rgb = re.findall(r'rgb\((.+)\)', row['stimulus'])[0].split(',')
                color = {'r': int(rgb[0]), 'g': int(rgb[1]), 'b': int(rgb[2])}
                location = {
                    'top': float(re.findall(r'top:(.+)\%;', row['stimulus'])[0]),
                    'left': float(re.findall(r'left:(.+)\%', row['stimulus'])[0])
                }
            except (IndexError, ValueError):
                color = {'r': 0, 'g': 0, 'b': 0}
                location = {'top': 50, 'left': 50}
            stimDict.append({
                'text': word,
                'color': color,
                'location': location,
                'category': wordpool['CATEGORY'].iloc[wp_idx],
                'size': wordpool['SIZE'].iloc[wp_idx],
                'wordLength': len(word),
                'firstLetter': word[0],
                'listnum': row['listNumber']
            })
        return stimDict
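    # each stimDict entry has this shape (values are illustrative):
    # {'text': 'DOG', 'color': {'r': 0, 'g': 0, 'b': 0},
    #  'location': {'top': 50.0, 'left': 50.0}, 'category': 'animals',
    #  'size': 'small', 'wordLength': 3, 'firstLetter': 'D', 'listnum': 0}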
# this function loads in the recall data into an array of arrays, where each array represents a list of words
    def loadRecallData(subid):
        recalledWords = []
        for i in range(0, 16):
            # transcriptions may live in a per-subject folder or directly
            # in the recall folder, so try both locations
            try:
                f = open(recpath + subid + '/' + subid + '-' + str(i) + '.wav.txt', 'r')
            except (IOError, OSError):
                try:
                    f = open(recpath + subid + '-' + str(i) + '.wav.txt', 'r')
                except (IOError, OSError) as e:
                    print(e)
                    # keep list indices aligned when a file is missing
                    recalledWords.append([])
                    continue
            spamreader = csv.reader(f, delimiter=',', quotechar='|')
            try:
                words = []
                altformat = True
                for row in spamreader:
                    if len(row) > 1:
                        # single comma-separated row holding the whole list
                        recalledWords.append(row)
                        altformat = False
                        break
                    else:
                        # one word per row
                        try:
                            words.append(row[0])
                        except IndexError:
                            pass
                if altformat:
                    recalledWords.append(words)
            except Exception:
                print('could not process ' + recpath + subid + '/' + subid + '-' + str(i) + '.wav.txt')
            finally:
                f.close()
        return recalledWords
# this function computes accuracy for a series of lists
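    # (e.g. matching 8 words from a 16-word list gives acc/len(stim) = 0.5)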
def computeListAcc(stimDict,recalledWords):
accVec = []
for i in range(0,16):
stim = [stim['text'] for stim in stimDict if stim['listnum']==i]
recalled= recalledWords[i]
acc = 0
tmpstim = stim[:]
for word in recalled:
if word in tmpstim:
tmpstim.remove(word)
acc+=1
accVec.append(acc/len(stim))
return accVec
    def getFeatures(stimDict):
        # NB: stimDict[:] is only a shallow copy, so the dicts inside are
        # mutated in place; stimDict should not be reused after this call
        stimDict_copy = stimDict[:]
for item in stimDict_copy:
item['location'] = [item['location']['top'], item['location']['left']]
item['color'] = [item['color']['r'], item['color']['g'], item['color']['b']]
item.pop('text', None)
item.pop('listnum', None)
stimDict_copy = [stimDict_copy[i:i+16] for i in range(0, len(stimDict_copy), 16)]
return stimDict_copy
############################################################################
# main program #############################################################
# if its not a list, make it one
if type(dbpath) is not list:
dbpath = [dbpath]
# read in stimulus library
wordpool = pd.read_csv(wordpool)
# add custom filters
if filters:
        filter_func = [adaptive_filter, experimenter_filter, experiments_filter] + filters
else:
filter_func = [adaptive_filter, experimenter_filter, experiments_filter]
# load in dbs and convert to df, and filter
dfs = [db2df(db, filter_func=filter_func) for db in dbpath]
# concatenate the db files
df = pd.concat(dfs)
# subjects who have completed the exp
subids = list(df[df['listNumber']==15]['uniqueid'].unique())
# remove problematic subjects
if remove_subs:
for sub in remove_subs:
            try:
                subids.remove(sub)
            except ValueError:
                print('Could not find subject: ' + sub + ', skipping...')
# set up data structure to load in subjects
if groupby:
pres = [[] for i in range(len(groupby['exp_version']))]
rec = [[] for i in range(len(groupby['exp_version']))]
features = [[] for i in range(len(groupby['exp_version']))]
subs = [[] for i in range(len(groupby['exp_version']))]
# make each groupby item a list
groupby = [exp if type(exp) is list else [exp] for exp in groupby['exp_version']]
else:
pres = [[]]
rec = [[]]
features = [[]]
subs = [[]]
# for each subject that completed the experiment
for idx,sub in enumerate(subids):
# get the subjects data
filteredStimData = filterData(df,sub)
if filteredStimData['exp_version'].values[0] in experiments:
# create stim dict
stimDict = createStimDict(filteredStimData)
sub_data = pd.DataFrame(stimDict)
sub_data['subject']=idx
sub_data['experiment']=filteredStimData['exp_version'].values[0]
sub_data = sub_data[['experiment','subject','listnum','text','category','color','location','firstLetter','size','wordLength']]
# get features from stim dict
feats = getFeatures(stimDict)
# load in the recall data
recalledWords = loadRecallData(sub)
# get experiment version
exp_version = filteredStimData['exp_version'].values[0]
            # find the idx of the experiment for this subject
if groupby:
exp_idx = list(np.where([exp_version in item for item in groupby])[0])
else:
exp_idx = [0]
if exp_idx != []:
pres[exp_idx[0]].append([list(sub_data[sub_data['listnum']==lst]['text'].values) for lst in sub_data['listnum'].unique()])
rec[exp_idx[0]].append(recalledWords)
features[exp_idx[0]].append(feats)
subs[exp_idx[0]].append(sub)
eggs = [Egg(pres=ipres, rec=irec, features=ifeatures, meta={'ids' : isub}) for ipres,irec,ifeatures,isub in zip(pres, rec, features, subs)]
# map feature dictionaries in pres df to rec df
def checkword(x):
if x is None:
return x
else:
try:
return stim_dict[x['item']]
            except (KeyError, TypeError):
return x
# convert utf-8 bytes type to string
def update_types(egg):
featlist = list(egg.pres.loc[0].loc[0].values.tolist()[0].keys())
def update1df(df):
for sub in range(egg.n_subjects):
for liszt in range(egg.n_lists):
for item in range(len(df.loc[sub].loc[liszt].values.tolist())):
for feat in featlist:
if feat in df.loc[sub].loc[liszt].values.tolist()[item].keys():
                                if isinstance(df.loc[sub].loc[liszt].values.tolist()[item][feat], np.bytes_):
                                    try:
                                        df.loc[sub].loc[liszt].values.tolist()[item][feat] = str(df.loc[sub].loc[liszt].values.tolist()[item][feat], 'utf-8')
                                    except Exception:
                                        print("Subject " + str(sub) + ", list " + str(liszt) + ", item " + str(item) + ", feature " + str(feat) + ": Could not convert type " + str(type(df.loc[sub].loc[liszt].values.tolist()[item][feat])) + " to string.")
update1df(egg.pres)
update1df(egg.rec)
for egg in eggs:
update_types(egg)
old_meta = egg.meta
temp_eggs = [egg]
for i in range(egg.n_subjects):
e = egg.crack(subjects=[i])
stim = e.pres.values.ravel()
stim_dict = {str(x['item']) : {k:v for k, v in iter(x.items())} for x in stim}
e.rec = e.rec.applymap(lambda x: checkword(x))
temp_eggs.append(e)
edited_egg = stack_eggs(temp_eggs)
mapped_egg = edited_egg.crack(subjects=[i for i in range(egg.n_subjects,egg.n_subjects*2)])
mapped_egg.meta = old_meta
eggs[eggs.index(egg)] = mapped_egg
if len(eggs)>1:
return eggs
else:
return eggs[0]
def load_example_data(dataset='automatic'):
"""
Loads example data
The automatic and manual example data are eggs containing 30 subjects who completed a free
recall experiment as described here: https://psyarxiv.com/psh48/. The subjects
studied 8 lists of 16 words each and then performed a free recall test.
    The naturalistic example data is an egg containing 17 subjects who viewed and verbally
recounted an episode of the BBC series Sherlock, as described here:
https://www.nature.com/articles/nn.4450. We fit a topic model to hand-annotated
text-descriptions of scenes from the video and used the model to transform both the
scene descriptions and manual transcriptions of each subject's verbal recall. We then
used a Hidden Markov Model to segment the video model and the recall models, by subject,
into k events.
Parameters
----------
dataset : str
        The dataset to load. Can be 'automatic', 'manual', or 'naturalistic'. The free recall
        audio recordings for the 'automatic' dataset were transcribed by Google
        Cloud Speech and the 'manual' dataset was transcribed by humans. The 'naturalistic'
dataset was transcribed by humans and transformed as described above.
Returns
----------
data : quail.Egg
Example data
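
    Examples
    ----------
    A minimal usage sketch::

        egg = load_example_data()                        # automatic dataset
        egg = load_example_data(dataset='naturalistic')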
"""
    # dataset can only be automatic, manual, or naturalistic
assert dataset in ['automatic', 'manual', 'naturalistic'], "Dataset can only be automatic, manual, or naturalistic"
if dataset == 'naturalistic':
# open naturalistic egg
egg = Egg(**dd.io.load(os.path.dirname(os.path.abspath(__file__)) + '/data/' + dataset + '.egg'))
else:
# open pickled egg
try:
with open(os.path.dirname(os.path.abspath(__file__)) + '/data/' + dataset + '.egg', 'rb') as handle:
egg = pickle.load(handle)
        except Exception:
f = dd.io.load(os.path.dirname(os.path.abspath(__file__)) + '/data/' + dataset + '.egg')
egg = Egg(pres=f['pres'], rec=f['rec'], dist_funcs=f['dist_funcs'],
subjgroup=f['subjgroup'], subjname=f['subjname'],
listgroup=f['listgroup'], listname=f['listname'],
date_created=f['date_created'])
return egg.crack()