# written by John Alroy on 20 July 2018
# carries out shareholder quorum subsampling of published occurrence data,
# limiting the number of samples drawn from each reference
# expects occurrence, sample, and reference arrays of the same length
# merge.repeats option added by Hendrik Nowak on 20 July 2018
sqsbyref<-function(occurrences,samples,references,quorum=0.5,trials=100,sample.quota=1,merge.repeats=F) {
# convert taxon names to numbers to speed things up
occurrences <- xtfrm(occurrences)
# create a lookup of reference numbers corresponding to sample numbers
lookup <- array()
lookup[samples] <- references
# create a list of unique references
scrambled <- unique(references)
crosses <- 0
cross <- array()
for (i in 1:trials) {
# randomly order the list of references
scrambled <- sample(scrambled)
subsamples <- array()
drawn <- 0
lastu <- 0
lastr <- 0
for (j in 1:length(scrambled)) {
# get sample indices matching the reference
s <- which(lookup == scrambled[j])
# for references over the quota, draw a subset
if (length(s) > sample.quota)
s <- s[sample(length(s),sample.quota)]
# append samples to a growing list
subsamples[(drawn + 1):(drawn + length(s))] <- s
drawn <- drawn + length(s)
# compute the abundance distribution
# tabulate is faster than table
# by default, leave the occurrences alone
if (merge.repeats != T)
n <- tabulate(occurrences[samples %in% subsamples])
# or merge occurrences yielded by the same reference
else
n <- tabulate(unique(data.frame(occurrences=occurrences[samples %in% subsamples],references=references[samples %in% subsamples]))$occurrences)
# tabulate returns zero counts, unlike table
n <- n[which(n > 0)]
# compute Good's u
u <- 1 - length(which(n == 1)) / sum(n)
# record richness when it surpasses the quorum
if (lastu < quorum && u > quorum) {
crosses <- crosses + 1
cross[crosses] <- (length(n) + lastr) / 2
} else if (lastu < quorum && u == quorum) {
crosses <- crosses + 1
cross[crosses] <- length(n)
}
lastu <- u
lastr <- length(n)
}
}
# failure to cross is unacceptable
if (is.na(cross[1]))
return(c(NA,NA,NA))
# return the median and 95% confidence interval
return(quantile(cross,probs=c(0.025,0.5,0.975),na.rm=T))
}