#######################
#This script is an example of one way that you could identify and remove outliers
#There is no generally agreed upon method for identifying outliers in acceptability judgment experiments
#One method people use is to have "gold standard" items -- items with known ratings, so that you can throw out participants who "get them wrong"
#But given that acceptability is gradient, "wrong" is a continuum
#So one thing you can do is calculate how far away from the known judgment each participant is.
#Then you can remove any participant who is farther away from the known judgments than the other participants.
#This script does this for all of the fillers in the experiments, because each one was pre-tested, so each one can be a gold standard item
#######################
#NOTE: another way to deal with outliers is to have really large samples, then the effect of any one outlier is small, so you don't need to do anything!
#I prefer large samples (made easier by AMT). But I wanted to show you a method for outlier removal just for completeness.
#######################
# Read in the long-format data
d <- read.csv("results.long.format.csv")
# Keep only the gold standard items; in this experiment, the fillers
# (conditions "1F" through "7F") were all pre-tested, so each serves as a
# gold standard item.
filler.conditions <- c("1F", "2F", "3F", "4F", "5F", "6F", "7F")
gold.standards <- droplevels(subset(d, condition %in% filler.conditions))
#Now we need to know the expected value for each gold standard item. You could use a new file "keys.csv" for this.
#In this case, the expected judgment is right there in the name, so I can just use that.
#Remove F from the condition names
# Strip the trailing "F" from the condition labels and convert the remainder
# to a number: this recovers the expected (pre-tested) rating for each filler.
expected <- as.numeric(sub("F", "", as.character(gold.standards$condition)))
# How far each observed filler judgment deviates from its expected value
difference <- gold.standards$judgment - expected
# Recompose the dataset with the two new columns attached
fillers <- data.frame(gold.standards, expected, difference)
#calculate a metric for each subject
#In this case, I am going to use sum of squares, as that is a fairly common way of removing negative numbers
#
# ss: sum of squares of a numeric vector of deviations.
#   x       -- numeric vector (may contain NA)
#   returns -- sum(x^2) with NA values dropped; 0 for an empty vector
ss <- function(x) {
  # Use TRUE, not T: T is an ordinary variable that can be reassigned,
  # which would silently change the behavior of this function.
  sum(x^2, na.rm = TRUE)
}
#Calculate ss for each subject
# NOTE: we evaluate inside `fillers`, which actually contains the `difference`
# column. The original used with(gold.standards, ...), which has no such
# column -- it only worked because with() fell back to the global variable
# `difference`, a fragile dependency on evaluation-environment lookup.
subject.ss <- with(fillers,
  aggregate(
    list(sumsquares = difference),
    list(
      subject = subject
    ),
    ss #this is the function applied to the collapsing variable
  )
)
# Sort the per-subject sum-of-squares table from best fit to worst
subject.ss <- subject.ss[order(subject.ss$sumsquares), ]
# Mean and standard deviation of the subjects' sum-of-squares scores
ss.mean <- mean(subject.ss$sumsquares)
ss.sd <- sd(subject.ss$sumsquares)
# Criterion for flagging an outlier: 2 standard deviations above the mean.
# Sum of squares is always non-negative, so only the upper tail matters.
criterion <- ss.mean + 2 * ss.sd
# Any subject whose sum of squares exceeds the criterion is an outlier
outliers <- subject.ss[subject.ss$sumsquares > criterion, ]
# Collect the outlier subject ids...
outlier.subjects <- outliers$subject
# ...and drop every row belonging to one of them from the original data
d2 <- subset(d, !subject %in% outlier.subjects)
# Finally, write the cleaned dataset to a csv for safe keeping
write.csv(d2, file = "results.no.outliers.csv", row.names = FALSE)