# written by John Alroy on 20 July 2018
# carries out shareholder quorum subsampling of published occurrence data,
#  limiting the number of samples drawn from each reference
# expects occurrence, sample, and reference arrays of the same length
# merge.repeats option added by Hendrik Nowak on 20 July 2018

sqsbyref<-function(occurrences,samples,references,quorum=0.5,trials=100,sample.quota=1,merge.repeats=F)	{
	# convert taxon names to numbers to speed things up
	occurrences <- xtfrm(occurrences)
	# create a lookup of reference numbers corresponding to sample numbers
	lookup <- array()
	lookup[samples] <- references
	# create a list of unique references
	scrambled <- unique(references)
	crosses <- 0
	cross <- array()
	for (i in 1:trials)	{
		# randomly order the list of references
		scrambled <- sample(scrambled)
		subsamples <- array()
		drawn <- 0
		lastu <- 0
		lastr <- 0
		for (j in 1:length(scrambled))	{
			# get sample indices matching the reference
			s <- which(lookup == scrambled[j])
			# for references over the quota, draw a subset
			if (length(s) > sample.quota)
				s <- s[sample(length(s),sample.quota)]
			# append samples to a growing list
			subsamples[(drawn + 1):(drawn + length(s))] <- s
			drawn <- drawn + length(s)
			# compute the abundance distribution
			# tabulate is faster than table
			# by default, leave the occurrences alone
			if (merge.repeats != T)
				n <- tabulate(occurrences[samples %in% subsamples])
			# or merge occurrences yielded by the same reference
			else
				n <- tabulate(unique(data.frame(occurrences=occurrences[samples %in% subsamples],references=references[samples %in% subsamples]))$occurrences)
			# tabulate returns zero counts, unlike table
			n <- n[which(n > 0)]
			# compute Good's u
			u <- 1 - length(which(n == 1)) / sum(n)
			# record richness when it surpasses the quorum
			if (lastu < quorum && u > quorum)	{
				crosses <- crosses + 1
				cross[crosses] <- (length(n) + lastr) / 2
			} else if (lastu < quorum && u == quorum)	{
				crosses <- crosses + 1
				cross[crosses] <- length(n)
			}
			lastu <- u
			lastr <- length(n)
		}
	}
	# failure to cross is unacceptable
	if (is.na(cross[1]))
		return(c(NA,NA,NA))
	# return the median and 95% confidence interval
	return(quantile(cross,probs=c(0.025,0.5,0.975),na.rm=T))
}