# written by John Alroy on 20 July 2018 # carries out shareholder quorum subsampling of published occurrence data, # limiting the number of samples drawn from each reference # expects occurrence, sample, and reference arrays of the same length # merge.repeats option added by Hendrik Nowak on 20 July 2018 sqsbyref<-function(occurrences,samples,references,quorum=0.5,trials=100,sample.quota=1,merge.repeats=F) { # convert taxon names to numbers to speed things up occurrences <- xtfrm(occurrences) # create a lookup of reference numbers corresponding to sample numbers lookup <- array() lookup[samples] <- references # create a list of unique references scrambled <- unique(references) crosses <- 0 cross <- array() for (i in 1:trials) { # randomly order the list of references scrambled <- sample(scrambled) subsamples <- array() drawn <- 0 lastu <- 0 lastr <- 0 for (j in 1:length(scrambled)) { # get sample indices matching the reference s <- which(lookup == scrambled[j]) # for references over the quota, draw a subset if (length(s) > sample.quota) s <- s[sample(length(s),sample.quota)] # append samples to a growing list subsamples[(drawn + 1):(drawn + length(s))] <- s drawn <- drawn + length(s) # compute the abundance distribution # tabulate is faster than table # by default, leave the occurrences alone if (merge.repeats != T) n <- tabulate(occurrences[samples %in% subsamples]) # or merge occurrences yielded by the same reference else n <- tabulate(unique(data.frame(occurrences=occurrences[samples %in% subsamples],references=references[samples %in% subsamples]))$occurrences) # tabulate returns zero counts, unlike table n <- n[which(n > 0)] # compute Good's u u <- 1 - length(which(n == 1)) / sum(n) # record richness when it surpasses the quorum if (lastu < quorum && u > quorum) { crosses <- crosses + 1 cross[crosses] <- (length(n) + lastr) / 2 } else if (lastu < quorum && u == quorum) { crosses <- crosses + 1 cross[crosses] <- length(n) } lastu <- u lastr <- length(n) } } # failure to cross is unacceptable if (is.na(cross[1])) return(c(NA,NA,NA)) # return the median and 95% confidence interval return(quantile(cross,probs=c(0.025,0.5,0.975),na.rm=T)) }