## 12.10.2019: Cleanded and commented
## 8.02.2018: simplified MC model to speed up
## 9.18.2020: Eqn numbers refer to numbers in criterion document
chlrat <- function(df1, qa, varout = NULL, runmod = T) {

    require(rstan)
    require(mgcv)
    source("logtick.exp.R")
    source("binv.R")

    df1$site.id.c <- factor(df1$site.id.c)
    qa$idnew <- factor(qa$idnew)

    nchains <- 6  # number of concurrent chains in MCMC simulation

    ## set parameters here for calculating criteria
    p.exceed <- 0.1  # set allowable exceedance rate
    thold <- 8         # set MC threshold
    credint <- 0.9    # set credible interval for candidate criterion

    ## make mc into integer times 10
    df1$mc <- round(df1$mc*10)
    df1$mc[is.na(df1$mc)] <- 0

    ## drop below detection limit chl
    df1$chl[df1$chl==0] <- NA

    ## log transforms
    df1$log.cyano <- log(df1$biov.cyano*1e-6)
    df1$biov.tot <- log(df1$biov.tot*1e-6)
    df1$chl.loge <- log(df1$chl)

    ## drop samples missing cyano or chl
    incvec <- !is.na(df1$log.cyano) & ! is.na(df1$chl.loge)
    df1 <- df1[incvec,]

    ## make a new id based on site and visit no
    df1$id2 <- paste(df1$site.id.c, df1$visit.no, sep = "--")

    ## outliers: two cyano biov measurements that are very low omitted
    incvec <- df1$log.cyano < -3 & df1$mc > 50
    df1 <- df1[!incvec,]

    ## drop HI (not in conterminous US)
    incvec <- df1$st.nla2012 == "HI"
    df1 <- df1[!incvec,]

    df1 <- df1[, c("idnew",  "log.cyano", "prop.cyano", "biov.tot",
                   "chl.loge", "mc",
                   "st.nla2012", "us.l3code")]

    ## center chl and biov
    mnval.chl <- mean(df1$chl.loge)
    df1$chl.loge <- df1$chl.loge - mnval.chl
    mnval.biov <- mean(df1$biov.tot)
    df1$biov.tot <- df1$biov.tot - mnval.biov
    df1$log.cyano <- df1$log.cyano - mnval.biov

    ## prepare QA data
    qa$log.cyano1 <- log(qa$biov1.cyano*1.e-6)
    qa$log.cyano2 <- log(qa$biov2.cyano*1.e-6)
    qa$prop.cyano2 <- qa$biov2.cyano/qa$biov2
    qa$biov.tot2 <- log(qa$biov2*1.e-6) - mnval.biov

    ## only MSU resample is a duplicate so only use that one
    selvec <- ! is.na(qa$log.cyano2)
    qa <- qa[selvec,]
    df2 <- merge(df1, qa[, c("idnew", "log.cyano2", "prop.cyano2", "biov.tot2")],
                 by = "idnew")
    df1b <- df2[, c("idnew", "log.cyano2", "prop.cyano2", "biov.tot2")]
    names(df1b) <- c("idnew", "log.cyano", "prop.cyano", "biov.tot")

    ## add in blank columns so that I can rbind QA data with main data
    df1b$chl.loge <- NA
    df1b$mc <- NA
    df1b$st.nla2012 <- NA
    df1b$us.l3code <- NA
    df1 <- rbind(df1, df1b)

    ## subsample for model testing (sims are faster with a smaller dataset)
    #set.seed(10)   # random seed is only used when data is subsampled
    idu <- unique(df1$idnew)
    selvec <- is.na(df1$mc)
    idp <- df1$idnew[selvec]  # select replicates in subsample
    isamp <- sample(length(idu))
    n <- nrow(df1)
    idpick <- unique(c(idp, idu[isamp[1:round(n*0.20)]]))
    incvec <- rep(F, times = nrow(df1))
    for (i in idpick) incvec <- incvec|df1$idnew == i
    ## comment this out to stop subsampling
    #df1 <- df1[incvec,]

    ## reset site id factors
    df1$idnew <- factor(df1$idnew)
    df1$sampnum <- as.numeric(df1$idnew)
    w <- regexpr("--", as.character(df1$idnew))
    df1$site.id.c <- substring(as.character(df1$idnew), 1, w-1)
    df1$site.id.c <- factor(df1$site.id.c)
    df1$st.nla2012 <- factor(df1$st.nla2012)
    df1$istate <- as.numeric(df1$st.nla2012)
    df1$us.l3code <- factor(df1$us.l3code)

    ## specify dfsamp as samples with mc (not replicate cyano)
    incvec <- ! is.na(df1$mc)
    dfsamp <- df1[incvec,]
    dfsamp <- dfsamp[order(dfsamp$sampnum),]

    mnval <- c(mnval.chl, mnval.biov)
    names(mnval) <- c("chl","biov")

    ## save raw data for shiny
    save(dfsamp, mnval, file = "shinydat.rda")

    lookup <- unique.data.frame(dfsamp[, c("sampnum", "site.id.c")])
    lookup$sitenum <- as.numeric(lookup$site.id.c)
    lookup <- lookup[order(lookup$sampnum),]

    datstan <- list(n = nrow(df1),
                    nsamp = nrow(dfsamp),
                    chl = dfsamp$chl.loge,
                    chl2 = dfsamp$chl.loge^2,
                    nsite = max(lookup$sitenum),
                    sitenum = lookup$sitenum,
                    propcyano = round(df1$prop.cyano*100),
                    biov = df1$biov.tot,
                    id = df1$sampnum,
                    mc = dfsamp$mc,
                    nstate = length(levels(dfsamp$st.nla2012)),
                    istate = as.numeric(dfsamp$st.nla2012))

    print(str(datstan))

    modstan <- '
       data {
           int n;                  // total number of samples
           int nsamp;              // number of samples without QA
           vector[nsamp] chl;      // observed log chl
           vector[nsamp] chl2;     // log chl squared
           int nsite;              // number of lakes
           int sitenum[nsamp];     // lookup table for sitenum
           int propcyano[n];       // proportion cyanobacteria
           vector[n] biov;         // total phytoplankton biovolume
           int id[n];              // sample number for full dataset
           int mc[nsamp];          // observed MC
           int nstate;             // number of states
           int istate[nsamp];      // state associated with each sample
       }
       parameters {
          // parameters for chl-cyano relationship
           vector[3] muf;    // coef for quadratic relationship
           real<lower = 0> sig[4];  // variance components
           vector[nsamp] eta1;      // overparametrization variables
           vector[nsamp] eta2;
           vector[n] eta3;

           // parameters for cyano-mc relationship
           vector[3] d;  // coefficients for piecewise linear model
           real cp;      // changepoint
           real<lower = 0> phi; //over dispersion for neg bin

           // parameters for site mean chl
           real<lower = 0> sigchl[2];
           vector[nsite] etachl;
       }
       transformed parameters {
            vector[nsamp] biov_mean;
            vector[nsamp] alpha;
            vector[nsamp] cyano_mean;
            vector[n] alpha2;
            vector[nsamp] mcmean;
            vector[nsite] chlsite;

            chlsite = etachl*sigchl[1];

            // quadratic relationship for chl-propcyano (Eqn 17)
            alpha = muf[1] + muf[2] * chl + muf[3] * chl2 + sig[2]*eta2;
            alpha2 = alpha[id] + sig[3]*eta3;

            // compute mean cyano (Eqn 21)
            biov_mean = chl + eta1*sig[1];
            cyano_mean = biov_mean + log(inv_logit(alpha));

            // piecewise linear relationship for MC (Eqn 21)
            mcmean = d[1] + d[2]*cyano_mean +
               (d[3]-d[2])*(cyano_mean - cp) .* inv_logit(10*(cyano_mean-cp));
       }

       model {
            // weakly informative priors
            sigchl ~ cauchy(0,3);
            etachl ~ normal(0,1);
            sig ~ cauchy(0,3);
            eta1 ~ normal(0,1);
            eta2~ normal(0,1);
            eta3 ~ normal(0,1);
            d ~ normal(0,3);
            cp ~ normal(0,1); // prior for cp suggests changepoint is close
                              // to mean value of cyano biovolume
            phi ~ normal(0,3);

           // sampling statements
            chl ~ normal(chlsite[sitenum], sigchl[2]);
            biov ~ normal(biov_mean[id], sig[4]);
            propcyano ~ binomial_logit(100, alpha2);
            mc ~ neg_binomial_2_log(mcmean, phi);
       }
    '

    if (runmod) {

        rstan_options(auto_write= TRUE)
        options(mc.cores = nchains)
        fitout <- stan(model_code = modstan, data = datstan,
                       chains = nchains, iter = 1500, warmup = 300, thin = 4,
                       control = list(max_treedepth = 15, adapt_delta = 0.9))
        return(fitout)
    }
    else {

        grey.t <- adjustcolor("grey39", alpha.f = 0.5)

        ### plot chl vs. phytoplankton biovolume (Fig. 21 in document)
        dev.new()
        layout(matrix(c(1,2,3), nrow = 1, ncol = 3), widths = c(2, 2.25, 2.25))
        par(mar = c(4,4,1,1), mgp = c(2.3,1,0))
        plot(dfsamp$chl.loge + mnval["chl"], dfsamp$biov.tot + mnval["biov"],
             axes = F,
             xlab = expression(Chl~italic(a)~(mu*g/L)),
             ylab = expression(Phytoplankton~biovolume~(mm^3/L)),
             pch = 21, col = "grey", bg = "white")
        abline(-mnval["chl"]+mnval["biov"], 1)
        logtick.exp(0.0001, 10, c(1,2), c(F,F))

        ### plot chl vs. cyano relative abundance (prop.cyano)
        ## prepare prop.cyano for plotting by converting 1s and 0s to
        ## values close to 1 and 0, so they can be logit transformed
        incvec <- dfsamp$prop.cyano == 1
        dfsamp$prop.cyano[incvec] <- NA
        maxval <- max(dfsamp$prop.cyano, na.rm = T)
        dfsamp$prop.cyano[incvec] <- 1 - (1-maxval)*0.5
        incvec <- dfsamp$prop.cyano == 0
        dfsamp$prop.cyano[incvec] <- NA
        minval <- min(dfsamp$prop.cyano, na.rm = T)
        dfsamp$prop.cyano[incvec] <- 0.5*minval

        ## average prop.cyano in bins of about 20 samples
        bout <- binv(dfsamp$chl.loge,
                     qlogis(dfsamp$prop.cyano), 20)

        ## compute posterior predictions of mean relationship
        np <- 80   # posterior predictions at 80 points
        ## set up storage for median and upper and lower credible intervals
        predout <- matrix(NA, ncol = 3, nrow = np)
        xrange <- range(bout$xb)
        x <- seq(xrange[1], xrange[2], length = np)
        for (j in 1:np) {
            y <- varout$muf[,1] + varout$muf[,2]*x[j] +
                varout$muf[,3]*x[j]^2
            ## compute median prediction and 90% credible intervals
            predout[j,] <- quantile(y,prob = c(0.05,0.5,0.95))
        }

        plot(bout$xb+mnval["chl"], bout$yb, axes = F,
             pch = 21,  col = "grey39",
             bg = "white", ylab = "Cyanobacteria rel. abund.",
             xlab = expression(Chl~italic(a)~(mu*g/L)))
        logtick.exp(0.001, 10, c(1), c(F,T))
        ## vertical axis is logit transformed
        lab0 <- c(0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95)
        at0 <- qlogis(lab0)
        axis(2, at = at0, lab = lab0)
        lab0 <- seq(0.025, 0.95, by = 0.025)
        at0 <- qlogis(lab0)
        axis(2, at = at0, lab = NA, tcl = -0.2)
        polygon(c(x, rev(x)) + mnval["chl"],
                c(predout[,1], rev(predout[,3])),
                border= NA, col = grey.t)
        lines(x + mnval["chl"], predout[,2])

        ## plot mean cyano biov vs. MC
        ## compute mean predicted cyano biov from model
        dfsamp$cyano.mean <- apply(varout$cyano_mean, 2, mean)

        ## mean microcystin in bins of 20
        bout <- binv(dfsamp$cyano.mean, dfsamp$mc, 20)
        bout$yb[bout$yb == 0] <- NA

        ## posterior predictions assuming mean cyano biov is known
        x <- seq(min(bout$xb), max(bout$xb),length = np)
        predout <- matrix(NA, ncol = 3, nrow=length(x))
        for (j in 1:length(x)) {
            mcmean <- varout$d[,1] + varout$d[,2]*x[j] +
                (varout$d[,3]-varout$d[,2])*
                    (x[j] - varout$cp)*as.numeric(x[j] > varout$cp)
            ## compute median and 90% credible intervals
            predout[j,] <- quantile(mcmean-log(10), prob = c(0.05, 0.5, 0.95))
        }
        ylim <- range(log(bout$yb*0.1), na.rm = T)
        plot(bout$xb, log(bout$yb*0.1), axes = F,
             pch = 21, col = "grey39", bg = "white",
             xlab = expression(Cyano~biovolume~(mm^3/L)),
             ylab = expression(MC~(mu*g/L)))
        incvec <- is.na(bout$yb)
        points(bout$xb[incvec],
               rep(ylim[1] - 0.02*diff(range(ylim)), times = sum(incvec)),
               pch = 21, col ="grey39", bg = "black", cex = 0.6)
        logtick.exp(0.0001, 10, c(1,2), c(F,F))

        polygon(c(x, rev(x)),
                c(predout[,1], rev(predout[,3])),
                col = grey.t, border= NA)
        lines(x, predout[,2])

        ## plot all mc data vs. chl (Fig. 22)
        grey.t <- adjustcolor("grey", alpha.f = 0.5)

        ## simulate mc given chl using model output

        # set up chl values where predicted output will be computed
        # 50 values ranging from minimum to maximum chl
        chlnew <- seq(min(dfsamp$chl.loge),
                      max(dfsamp$chl.loge),length = 50)
        nit <- length(varout$phi)  # number of samples in MCMC sim

        ## store sampled posterior distribution of model parameters
        ## repeat them fac times to increase sample size
        ## for plotting

        f1 <- varout$muf[,1]
        f2 <- varout$muf[,2]
        f3 <- varout$muf[,3]
        sig1 <- varout$sig[,1]
        sig2 <- varout$sig[,2]
        cp <- varout$cp
        d1 <- varout$d[,1]
        d2 <- varout$d[,2]
        d3 <- varout$d[,3]
        sigchl <- varout$sigchl[,2]

        ## set up storage location for predicted mc
        mcpred <- matrix(NA, nrow=length(chlnew), ncol = nit)
        ## compute predicted MC
        for (i in 1:length(chlnew)) {
            ## get error associated with estimating seasonal mean chl
            chlsamp <- rnorm(nit, mean = chlnew[i], sd = sigchl)
            ## predict proportion cyano for each chl
            alpha <- rnorm(nit, mean = f1 + f2*chlsamp + f3*chlsamp^2,
                           sd = sig1)
            ## estimate of cyano abundance
            mn0 <- rnorm(nit, mean = log(plogis(alpha)) + chlsamp,
                         sd = sig2)
            ## predict mean MC given cyano
            y.sc <- d1 + d2*mn0 + (d3 - d2)*mn0*plogis(10*(mn0 - cp))
            # sample from negative binomial to get distribution of
            # observed MC given mean mc at the selected percentile
            mcpred[i,] <- qnbinom(1-p.exceed, mu = exp(y.sc),
                                  size = varout$phi)
        }

        # compute mean, 90th and 99th percentile of predicted distribution
        # of microcysin
        mcmean0 <- apply(mcpred, 1, median)
        ## compute selected credible intervals
        mcup <- apply(mcpred, 1, quantile, prob = 1-(1-credint)*0.5)
        mclo <- apply(mcpred, 1, quantile, prob = (1-credint)*0.5)

        # compute candidate chl criterion
        crit <- approx(mcup, chlnew, thold*10)$y

        dev.new()
        layout(matrix(c(1,2), nrow = 2, ncol = 1),
               heights = c(1, 0.5))
        par(mar = c(0,5,1,1), mgp = c(2.3,1,0))

        ## identify observed microcystin samples that are zero
        ## these needed to be plotted differently in log-transformed coord
        iszero <- dfsamp$mc == 0
        dfsamp$mc[iszero] <- NA
        xlim <- range(c(dfsamp$chl.loge + mnval["chl"]))
        ylim <- c(log(0.1), max(log(mcup*0.1)))

        ymin <- ylim[1] - 0.04*diff(ylim)
        xmin <- xlim[1] - 0.04*diff(xlim)

        ## set zero values of lower CL to ymin so that error bound is drawn
        ## correctly in log-transformed coord
        mclo[mclo ==0] <- exp(ymin)
        ## plotting observed data for grab chl vs mc, but
        ## modeled relationship is based on
        ## seasonal mean chl, because the difference isn't that great.
        ## Effect of seasonal mean chl is manifested in the credible
        ## intervals
        plot(dfsamp$chl.loge + mnval["chl"],
             log(dfsamp$mc*0.1),
             xlab = expression(Chl~italic(a)~(mu*g/L)),
             ylab = expression(MC~(mu*g/L)), axes = F,
             xlim  = xlim, ylim = ylim,  type = "n")
        points(dfsamp$chl.loge + mnval["chl"],
           log(dfsamp$mc*0.1), pch = 21, col = "grey39",
           bg = "white", cex = 0.8)
#            points(bout$xb, log(bout$yb), pch = 21, col = "grey39",
#                   bg = "white")
        logtick.exp(0.0001, 10, c(2), c(F,F))
        tickloc <-  c(seq(0.01, 0.09, 0.01), seq(0.1, 0.9, 0.1),
                      seq(1,9, 1), seq(10,90,10), seq(100,900, 100))
        axis(1, at = log(tickloc), lab = NA, tcl = -0.2)
        axis(1, at = log(c(0.01, 0.1, 1, 10, 100, 1000)), lab = NA)

        # add predicted percentile lines
        polygon(c(chlnew + mnval["chl"], rev(chlnew + mnval["chl"])),
                c(log(mcup*0.1), rev(log(mclo*0.1))), border = NA,
                col = grey.t)
        lines(chlnew + mnval["chl"], log(mcmean0*0.1))

        # draw crit line
        if (!is.na(crit)) {
            segments(xmin, log(thold), crit + mnval["chl"], log(thold),
                     col = "red")
            segments(crit+mnval["chl"], log(thold),
                     crit+mnval["chl"], ymin, col = "red")
        }
        else cat("Criteria could not be calculated.\n")

        ## put plot of non-detects at the bottom
        par(mar = c(4,5,1,1))
        cutp <- quantile(dfsamp$chl.loge, prob = seq(0,1, length = 21))
        cutm <- (cutp[-1] + cutp[-length(cutp)])*0.5
        cutf <- cut(dfsamp$chl.loge, cutp, include.lowest = T)
        pnond <- tapply(as.numeric(iszero), cutf, mean, include.lowest = T)

        plot(cutm + mnval["chl"], pnond, xlim = xlim, axes = F,
             xlab = expression(Chl~italic(a)~(mu*g/L)),
             ylab = "Proportion\n non-detect")
        axis(2)
        logtick.exp(0.001, 10, c(1), c(F,F))

        cat("Criterion value:", round(exp(crit + mnval["chl"]), digits = 1), "\n")

    }

    return()

}

#fitout <- chlrat(dat.merge.all, qa.biov, runmod = T)
#varout <- extract(fitout, pars = c("muf", "sig", "d", "cp", "phi",
#                              "cyano_mean", "sigchl", "chlsite"))
chlrat(dat.merge.all, qa.biov, varout, runmod = F)



