# 12.10.2019: Cleanded and commented
# 8.02.2018: simplified MC model to speed up
chlrat <- function(df1, qa, fitout = NULL, runmod = T) {

    require(rstan)
    require(mgcv)
    source("logtick.exp.R")
    source("binv.R")

    nchains <- 6  # number of concurrent chains

    # make mc into integer times 10
    df1$mc <- round(df1$mc*10)
    df1$mc[is.na(df1$mc)] <- 0

    # drop below detection limit chl
    df1$chl[df1$chl==0] <- NA

    # log transforms
    df1$log.cyano <- log(df1$biov.cyano*1e-6)
    df1$biov.tot <- log(df1$biov.tot*1e-6)
    df1$chl.loge <- log(df1$chl)

    # drop samples missing cyano or chl
    incvec <- !is.na(df1$log.cyano) & ! is.na(df1$chl.loge)
    df1 <- df1[incvec,]

    df1$id2 <- paste(df1$site.id.c, df1$visit.no, sep = "--")

    # outliers: two cyano biov measurements that are very low omitted
    incvec <- df1$log.cyano < -3 & df1$mc > 50
    df1 <- df1[!incvec,]

    # drop HI (not in conterminous US)
    incvec <- df1$st.nla2012 == "HI"
    df1 <- df1[!incvec,]

    df1 <- df1[, c("idnew",  "log.cyano", "prop.cyano", "biov.tot",
                   "chl.loge", "mc",
                   "st.nla2012", "us.l3code")]

    # center chl and biov
    mnval.chl <- mean(df1$chl.loge)
    df1$chl.loge <- df1$chl.loge - mnval.chl
    mnval.biov <- mean(df1$biov.tot)
    df1$biov.tot <- df1$biov.tot - mnval.biov
    df1$log.cyano <- df1$log.cyano - mnval.biov

    # prepare QA data
    qa$log.cyano1 <- log(qa$biov1.cyano*1.e-6)
    qa$log.cyano2 <- log(qa$biov2.cyano*1.e-6)
    qa$prop.cyano2 <- qa$biov2.cyano/qa$biov2
    qa$biov.tot2 <- log(qa$biov2*1.e-6) - mnval.biov

    # only MSU resample is a duplicate so only use that one
    selvec <- ! is.na(qa$log.cyano2)
    qa <- qa[selvec,]
    df2 <- merge(df1, qa[, c("idnew", "log.cyano2", "prop.cyano2", "biov.tot2")],
                 by = "idnew")

    df1b <- df2[, c("idnew", "log.cyano2", "prop.cyano2", "biov.tot2")]
    names(df1b) <- c("idnew", "log.cyano", "prop.cyano", "biov.tot")

    # add in blank columns so that I can rbind QA data with main data
    df1b$chl.loge <- NA
    df1b$mc <- NA
    df1b$st.nla2012 <- NA
    df1b$us.l3code <- NA

    df1 <- rbind(df1, df1b)

    # subsample for model testing
    set.seed(10)
    idu <- unique(df1$idnew)
    selvec <- is.na(df1$mc)
    idp <- df1$idnew[selvec]  # select replicates in subsample
    isamp <- sample(length(idu))
    n <- nrow(df1)
    idpick <- unique(c(idp, idu[isamp[1:round(n*0.20)]]))
    incvec <- rep(F, times = nrow(df1))
    for (i in idpick) incvec <- incvec|df1$idnew == i
    # comment this out to stop subsampling
    #df1 <- df1[incvec,]

    # reset site id factors
    df1$idnew <- factor(df1$idnew)
    df1$sampnum <- as.numeric(df1$idnew)
    w <- regexpr("--", as.character(df1$idnew))
    df1$site.id.c <- substring(as.character(df1$idnew), 1, w-1)
    df1$site.id.c <- factor(df1$site.id.c)

    df1$st.nla2012 <- factor(df1$st.nla2012)
    df1$istate <- as.numeric(df1$st.nla2012)
    df1$us.l3code <- factor(df1$us.l3code)

    # specify dfsamp as samples with mc (not replicate cyano)
    incvec <- ! is.na(df1$mc)
    dfsamp <- df1[incvec,]
    dfsamp <- dfsamp[order(dfsamp$sampnum),]

    mnval <- c(mnval.chl, mnval.biov)
    names(mnval) <- c("chl","biov")

    lookup <- unique.data.frame(dfsamp[, c("sampnum", "site.id.c")])
    lookup$sitenum <- as.numeric(lookup$site.id.c)
    lookup <- lookup[order(lookup$sampnum),]

    datstan <- list(n = nrow(df1),
                    nsamp = nrow(dfsamp),
                    chl = dfsamp$chl.loge,
                    chl2 = dfsamp$chl.loge^2,
                    nsite = max(lookup$sitenum),
                    sitenum = lookup$sitenum,
                    propcyano = round(df1$prop.cyano*100),
                    biov = df1$biov.tot,
                    id = df1$sampnum,
                    mc = dfsamp$mc,
                    nstate = length(levels(dfsamp$st.nla2012)),
                    istate = as.numeric(dfsamp$st.nla2012))

    print(str(datstan))

    modstan <- '
       data {
           int n;                  // total number of samples
           int nsamp;              // number of samples without QA
           vector[nsamp] chl;      // observed log chl
           vector[nsamp] chl2;     // log chl squared
           int nsite;              // number of lakes
           int sitenum[nsamp];     // lookup table for sitenum
           int propcyano[n];       // proportion cyanobacteria
           vector[n] biov;         // total phytoplankton biovolume
           int id[n];              // sample number for full dataset
           int mc[nsamp];          // observed MC
           int nstate;             // number of states
           int istate[nsamp];      // state associated with each sample
       }
       parameters {
          // parameters for chl-cyano relationship
           vector[3] muf;
           real<lower = 0> sig[4];
           vector[nsamp] eta1;
           vector[nsamp] eta2;
           vector[n] eta3;

           // parameters for cyano-mc relationship
           vector[3] d;
           real cp;
           real<lower = 0> phi;

           // parameters for site mean chl
           real<lower = 0> sigchl[2];
           vector[nsite] etachl;
       }
       transformed parameters {
            vector[nsamp] biov_mean;
            vector[nsamp] alpha;
            vector[nsamp] cyano_mean;
            vector[n] alpha2;
            vector[nsamp] mcmean;
            vector[nsite] chlsite;

            chlsite = etachl*sigchl[1];

            biov_mean = chl + eta1*sig[1];
            // quadratic relationship for chl-propcyano
            alpha = muf[1] + muf[2] * chl + muf[3] * chl2 + sig[2]*eta2;
            alpha2 = alpha[id] + sig[3]*eta3;
            // compute mean cyano
            cyano_mean = biov_mean + log(inv_logit(alpha));
            // piecewise linear relationship for MC
            mcmean = d[1] + d[2]*cyano_mean +
                    (d[3]-d[2])*(cyano_mean - cp) .* inv_logit(10*(cyano_mean-cp));
       }

       model {
            // priors
            sigchl ~ cauchy(0,3);
            etachl ~ normal(0,1);
            sig ~ cauchy(0,3);
            eta1 ~ normal(0,1);
            eta2~ normal(0,1);
            eta3 ~ normal(0,1);
            d ~ normal(0,3);
            cp ~ normal(0,1);
            phi ~ normal(0,3);

           // sampling statements
            chl ~ normal(chlsite[sitenum], sigchl[2]);
            biov ~ normal(biov_mean[id], sig[4]);
            propcyano ~ binomial_logit(100, alpha2);
            mc ~ neg_binomial_2_log(mcmean, phi);
       }
    '

    if (runmod) {

        rstan_options(auto_write= TRUE)
        options(mc.cores = nchains)
        fitout <- stan(model_code = modstan, data = datstan,
                       chains = nchains, iter = 500, warmup = 250,
                       control = list(max_treedepth = 15, adapt_delta = 0.9))
        return(fitout)
    }
    else {
        varout <- extract(fitout, pars = c("muf", "sig", "d", "cp", "phi",
                                      "cyano_mean", "sigchl", "chlsite"))

        grey.t <- adjustcolor("grey39", alpha.f = 0.5)

        ### plot chl vs. phytoplankton biovolume
        dev.new()
        #png(width = 6.5, height = 2, pointsize = 8, units = "in", res= 600,
#        file = "figures/all3.png")
        layout(matrix(c(1,2,3), nrow = 1, ncol = 3), widths = c(2, 2.25, 2.25))
        par(mar = c(4,4,1,1), mgp = c(2.3,1,0))
        plot(dfsamp$chl.loge + mnval["chl"], dfsamp$biov.tot + mnval["biov"],
             axes = F,
             xlab = expression(Chl~italic(a)~(mu*g/L)),
             ylab = expression(Phytoplankton~biovolume~(mm^3/L)),
             pch = 21, col = "grey", bg = "white")
        abline(-mnval["chl"]+mnval["biov"], 1)
        logtick.exp(0.0001, 10, c(1,2), c(F,F))

        ### plot chl vs. cyano relative abundance (prop.cyano)
        # prepare prop.cyano for plotting by converting 1s and 0s to
        # values close to 1 and 0, so they can be logit transformed
        incvec <- dfsamp$prop.cyano == 1
        dfsamp$prop.cyano[incvec] <- NA
        maxval <- max(dfsamp$prop.cyano, na.rm = T)
        dfsamp$prop.cyano[incvec] <- 1 - (1-maxval)*0.5
        incvec <- dfsamp$prop.cyano == 0
        dfsamp$prop.cyano[incvec] <- NA
        minval <- min(dfsamp$prop.cyano, na.rm = T)
        dfsamp$prop.cyano[incvec] <- 0.5*minval

        # average prop.cyano in bins of about 20 samples
        bout <- binv(dfsamp$chl.loge,
                     qlogis(dfsamp$prop.cyano), 20)

        # compute posterior predictions of mean relationship
        np <- 80   # posterior predictions at 80 points
        predout <- matrix(NA, ncol = 3, nrow = np)
        xrange <- range(bout$xb)
        x <- seq(xrange[1], xrange[2], length = np)

        for (j in 1:np) {
            y <- varout$muf[,1] + varout$muf[,2]*x[j] +
                varout$muf[,3]*x[j]^2
            predout[j,] <- quantile(y,prob = c(0.05,0.5,0.95))
        }

        plot(bout$xb+mnval["chl"], bout$yb, axes = F,
             pch = 21,  col = "grey39",
             bg = "white", ylab = "Cyanobacteria rel. abund.",
             xlab = expression(Chl~italic(a)~(mu*g/L)))
        logtick.exp(0.001, 10, c(1), c(F,T))
        # vertical axes are logit transformed
        lab0 <- c(0.025, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95)
        at0 <- qlogis(lab0)
        axis(2, at = at0, lab = lab0)
        lab0 <- seq(0.025, 0.95, by = 0.025)
        at0 <- qlogis(lab0)
        axis(2, at = at0, lab = NA, tcl = -0.2)
        polygon(c(x, rev(x)) + mnval["chl"],
                c(predout[,1], rev(predout[,3])),
                border= NA, col = grey.t)
        lines(x + mnval["chl"], predout[,2])

        ## plot mean cyano biov vs. MC
        ## compute mean predicted cyano biov
        dfsamp$cyano.mean <- apply(varout$cyano_mean, 2, mean)

        ## mean microcystin in bins of 15
        bout <- binv(dfsamp$cyano.mean, dfsamp$mc, 15)
        bout$yb[bout$yb == 0] <- NA

        ## posterior predictions assuming mean cyano biov is known
        x <- seq(min(bout$xb), max(bout$xb),length = np)
        predout <- matrix(NA, ncol = 3, nrow=length(x))
        for (j in 1:length(x)) {
            mcmean <- varout$d[,1] + varout$d[,2]*x[j] +
                (varout$d[,3]-varout$d[,2])*
                    (x[j] - varout$cp)*as.numeric(x[j] > varout$cp)
            predout[j,] <- quantile(mcmean-log(10), prob = c(0.05, 0.5, 0.95))
        }
        ylim <- range(log(bout$yb*0.1), na.rm = T)
        plot(bout$xb, log(bout$yb*0.1), axes = F,
             pch = 21, col = "grey39", bg = "white",
             xlab = expression(Cyano~biovolume~(mm^3/L)),
             ylab = expression(MC~(mu*g/L)))
        incvec <- is.na(bout$yb)
        points(bout$xb[incvec],
               rep(ylim[1] - 0.02*diff(range(ylim)), times = sum(incvec)),
               pch = 21, col ="grey39", bg = "black", cex = 0.6)
        logtick.exp(0.0001, 10, c(1,2), c(F,F))

        polygon(c(x, rev(x)),
                c(predout[,1], rev(predout[,3])),
                col = grey.t, border= NA)
        lines(x, predout[,2])



    }

}
## runmod variable set to T to run simulation and set to F to
##  run post processing.
fitout <- chlrat(dat.merge.all, qa.biov, runmod = T)
## post processing
chlrat(dat.merge.all, qa.biov, fitout, runmod = F)


