## clean and commented
dochlmod <- function(dat.merge, modat0, fitout = NULL, runmod = T) {

    require(mgcv)
    require(rMR)
    require(mgcv)
    load("modairtemp.rda")  # model of mean air temperature from tempmod.R
    source("binv.R")
    source("logtick.exp.R")

    dat.merge$site.id.c <- factor(dat.merge$site.id.c)

    ## set number of MCMC chains here
    nchains <- 6

    # drop littoral duplicates
    dat.merge <- subset(dat.merge, sample.type == "MICX")
    dat.merge.sav <- dat.merge
    # select variables used in the model
    dat.merge <- dat.merge[, c("site.id.c", "doc.result", "therm.sav", "chl",
                               "domean", "tempmean","tbot",
                               "do.hyp.fail", "do.hyp.pass",
                               "yday.x", "st.nla2012","year", "tmax",
                               "index.lat.dd", "index.lon.dd",
                               "index.site.depth",
                               "elevation", "area.ha", "tmean.pt",
                               "tmin.pt")]

    sitemean <- function(x, site, varname,dolog = F) {
        if (! dolog) {
            xm <- tapply(x, site, mean, na.rm = T)
        }
        else {
            xm <- tapply(x, site, function(x) mean(log(x), na.rm = T))
        }
        return(xm)
    }

    ## compute mean values of lake characteristics and merge
    varlist <- c("doc.result", "index.lat.dd", "tmean.pt", "tmin.pt",
                 "index.lon.dd", "elevation", "area.ha", "index.site.depth")
    matout <- matrix(NA, ncol = length(varlist),
                     nrow = length(levels(dat.merge$site.id.c)))
    dolog <- c(T, rep(F, times = 7))
    for (i in 1:length(varlist)) {
        matout[,i] <- sitemean(dat.merge[, varlist[i]],
                               dat.merge$site.id.c, dolog = dolog[i])
    }
    dat.mean <- data.frame(levels(dat.merge$site.id.c), matout)
    names(dat.mean) <- c("site.id.c", "docmean", "Lat", "Temp", "TempMin",
                         "Lon", "Elev", "Area", "Depth")
    dat.mean$LakeRatio <- (dat.mean$Area*10000)^0.25/dat.mean$Depth

    ## model tempmin and fill in missing values
    df2 <- na.omit(dat.mean[, c("Elev", "Lat", "Lon", "TempMin")])
    mod <- gam(TempMin ~ s(Lat, Lon, k = 50) + Elev, data = df2)
    predout <- predict(mod, dat.mean[, c("Lat", "Lon", "Elev")])
    dat.mean$TempMin <- predout

    dat.merge<-merge(dat.merge, dat.mean,by = "site.id.c")

    ## compute mean depth below thermocline
    dat.merge$hyp.depth <-dat.merge$Depth - dat.merge$therm.sav
    incvec <- dat.merge$hyp.depth < 0
    incvec[is.na(incvec)] <- F
    dat.merge$hyp.depth[incvec] <- NA # drop sites with negative hyp depth
    hypdepth.mn <- tapply(dat.merge$hyp.depth, dat.merge$site.id.c, mean,
                          na.rm = T)
    dftemp <- data.frame(site.id.c = names(hypdepth.mn),
                         hypdepth.mn = as.vector(hypdepth.mn))
    dat.merge <- merge(dat.merge, dftemp, by = "site.id.c")

    ## drop missing data
    incvec <- ! is.na(dat.merge$hyp.depth) & ! is.na(dat.merge$domean) &
        ! is.na(dat.merge$chl) & ! is.na(dat.merge$docmean)
    dat.merge <- dat.merge[incvec,]

    ## set bottom temperature less than 4 to 4 for the purposes
    ## of calculating beginning DO
    incvec <- dat.merge$TempMin < 4
    incvec[is.na(incvec)] <- F
    dat.merge$TempMin[incvec] <- 4

    ## get beginning DO concentration at turnover
    dosat <- DO.saturation(rep(5, times = nrow(dat.merge)),
                           dat.merge$TempMin, elevation.m = dat.merge$Elev,
                           salinity = rep(0, times = nrow(dat.merge)),
                           salinity.units = "pp.thou")

    dat.merge$maxDO <- 5/dosat

    ## select lakes that seasonally stratify based on lake ratio
    dat.merge <- subset(dat.merge, LakeRatio < 3)

    ## drop 7 partial profiles depth > 25 m and number of measurements < 16
    incvec <- (dat.merge$do.hyp.pass + dat.merge$do.hyp.fail < 16) &
        dat.merge$hyp.depth > 25
    dat.merge <- dat.merge[!incvec,]

    ## ND alkaline lake dropped, WA super saturated lake, IN metalimnetic DO max
    ## these are big outliers in the model
    droplist <- c("NLA12_ND-R17", "NLA12_WA-R10", "NLA12_IN-131")
    for (i in droplist) {
        incvec <- dat.merge$site.id.c == i
        if (sum(incvec) == 1){
            dat.merge <- dat.merge[!incvec,]
        }
    }

    ## compute adjusted latitude for finding dimictic lakes
    lat0 <- seq(0, 90, by = 10)
    adj <- c(0.27, 0.31, 0.34, 0.39, 0.46, 0.54, 0.68, 0.89, 1.3, 2.4)
    dat.merge$adjloc <- approx(lat0, adj, dat.merge$Lat)$y
    dat.merge$adjlat <- dat.merge$Lat + dat.merge$adjloc*dat.merge$Elev/100

    dat.merge <- subset(dat.merge, adjlat > 40) # drop monomictic lakes

    ## standardize variables for model
    dat.merge$chl <- log(dat.merge$chl)
    varlist <- c("hypdepth.mn", "docmean", "chl", "yday.x", "Temp")
    mnsav <- apply(dat.merge[, varlist], 2, mean)
    sdsav <- apply(dat.merge[, varlist], 2, sd)

    for (i in varlist) {
        dat.merge[,paste(i, "sc", sep = ".")] <- (dat.merge[,i] - mnsav[i])/sdsav[i]
    }

    dat.merge$site.id.c <- factor(dat.merge$site.id.c)
    dat.merge$sitenum <- as.numeric(dat.merge$site.id.c)

    ## sitedata
    dfsite <- unique.data.frame(dat.merge[, c("sitenum", "hypdepth.mn.sc",
                                           "docmean.sc","maxDO",
                                           "Lat", "Temp.sc")])
    dfsite <- dfsite[order(dfsite$sitenum),]

    # output chl and DOm data
    dffull0 <- dat.merge
#    save(dffull0, file = "dffull0.rda")

    ## split data into missing (DO < 2) or observed
    incvec <- dat.merge$domean < 2
    dftemp1 <- dat.merge[!incvec,]
    dftemp2 <- dat.merge[incvec,]

    df1 <- modat0

    ## get mean air temperature at MO lakes using Lat, Lon, and Elev
    df1$Lon <- df1$Long
    df1$Elev <- df1$elev*0.3048
    df1$Temp <- predict(modairtemp, df1)

    ## standardize MO data by nla vals
    df1$chlmean.sc <- (df1$chlmean - mnsav["chl"])/sdsav["chl"]
    df1$docmean.sc <- (df1$docmean - mnsav["docmean"])/sdsav["docmean"]
    df1$yday.sc <- (df1$yday - mnsav["yday.x"])/sdsav["yday.x"]
    df1$depth.hm.sc<- (df1$depth.hm - mnsav["hypdepth.mn"])/sdsav["hypdepth.mn"]
    df1$Temp.sc <- (df1$Temp - mnsav["Temp"])/sdsav["Temp"]

    ## get state average value for Temp and elev to inform maxDO for MO data
    dftemp.state <- unique.data.frame(dat.merge.all[, c("site.id.c",
                                                        "st.nla2012")])
    dftemp.state <-merge(dftemp.state, dat.mean[, c("site.id.c", "TempMin",
                                                    "Elev")])
    incvec <- dftemp.state$st.nla2012 == "MO"
    mo.mn <- apply(dftemp.state[incvec, c("TempMin", "Elev")], 2, mean)
    dosat.mo <- DO.saturation(5,
                           mo.mn["TempMin"], elevation.m = mo.mn["Elev"],
                           salinity = 0,
                           salinity.units = "pp.thou")
    mo.maxDO <- 5/dosat.mo

    ## set up site ids
    df1$id <- factor(as.character(df1$id))
    df1$sitenum <- as.numeric(df1$id)
    df1$idyear <- factor(df1$idyear)
    df1$siteyrnum <- as.numeric(df1$idyear)

    lookup0 <- unique.data.frame(df1[, c("siteyrnum", "Temp.sc")])
    lookup0 <- lookup0[order(lookup0$siteyrnum),]

    dfsite0 <- unique.data.frame(df1[, c("sitenum", "depth.hm.sc", "chlmean.sc",
                                         "docmean.sc", "Temp.sc")])
    dfsite0 <- dfsite0[order(dfsite0$sitenum),]

    datstan <- list(n = c(nrow(dftemp1), nrow(dftemp2), nrow(dat.merge)),
                    nsite = nrow(dfsite),
                    chl = dat.merge$chl.sc,
                    temp = dfsite$Temp.sc,
                    depth = dfsite$hypdepth.mn.sc,
                    doc = dfsite$docmean.sc,
                    dmax = dfsite$maxDO,
                    t1 = dftemp1$yday.x.sc,
                    t2 = dftemp2$yday.x.sc,
                    domean = dftemp1$domean,
                    sitenum = dat.merge$sitenum,
                    sitenum1 = dftemp1$sitenum,
                    sitenum2 = dftemp2$sitenum,
                    n0 = nrow(df1),
                    t0 = df1$yday.sc,
                    domean0 = df1$domean.all,
                    dmax0 = mo.maxDO,
                    nsite0 = max(dfsite0$sitenum),
                    chl0 = dfsite0$chlmean.sc,
                    temp0 = as.vector(lookup0$Temp.sc),
                    depth0 = dfsite0$depth.hm.sc,
                    doc0 = dfsite0$docmean.sc,
                    sitenum0 = df1$sitenum,
                    nsiteyr0 = max(df1$siteyrnum),
                    siteyrnum0 = df1$siteyrnum)


    print(str(datstan))

    modstan <- '
         data {
             int n[3];                    // N above 2, below 2, and all
             int nsite;                   // number of samples
             vector[n[3]] chl;            // sample Chl

             vector[nsite] temp;          // mean air temperature
             vector[nsite] depth;         // depth below thermocline
             vector[nsite] doc;           // mean doc
             vector[nsite] dmax;          // maximum DO

             vector[n[1]] t1;             // sample day above 2s
             vector[n[2]] t2;             // sample day below 2s
             vector[n[1]] domean;         // mean DO
             int sitenum[n[3]];           // sitenums for different samples
             int sitenum1[n[1]];
             int sitenum2[n[2]];

             int n0;                      // N samples for MO
             vector[n0] t0;               // sampling day for MO
             vector[n0] domean0;          // mean DO for MO
             real dmax0;                  // maximum DO for MO
             int nsite0;                  // N sites
             int nsiteyr0;                // N site-year
             vector[nsiteyr0] temp0;      // Air temperature
             vector[nsite0] chl0;         // Chl
             vector[nsite0] depth0;       // depth below thermocline
             vector[nsite0] doc0;         // mean DOC
             int sitenum0[n0];
             int siteyrnum0[n0];

         }
         parameters {
             real muchl;                       // model for mean site chl
             real<lower =0> sigchl[2];         // variance terms for chl model
             vector[nsite] etachl;

             real mud[4];                      // Coefficients for VOD model
             vector<upper = 2>[n[2]] domean2;  // Estimates for censored DO

             real<upper = 0> b[2];             // Coefficients for strat day model
             real<lower = 0> sigt;             // Variance in strat day model
             vector[nsite] etat;
             vector[nsiteyr0] etat0;
             real<lower = 0> sigt0;

             real<lower = 0> sig;              // variance in predicted DO
             real<lower = 0> sig0;             // variance in predicted DO MO
             real<lower = 0> sigd;            // variance in VOD
             vector[nsite0] etad;
         }
         transformed parameters {
             vector[nsite] chlsite;
             vector[nsite] tstart;
             vector[n[1]] dopred1;
             vector[n[2]] dopred2;
             vector[nsiteyr0] tstart0;

             vector[n0] dopred0;
             vector[nsite0] d2;

             chlsite = muchl + etachl*sigchl[1];

             tstart = b[1] + b[2]*temp + etat*sigt;  // strat day model
             tstart0 = b[1] + b[2]*temp0 + etat0*sigt0;

             // VOD model
             d2 = -exp(mud[1]) + mud[2]*chl0 + mud[3]*depth0+mud[4]*doc0 + etad*sigd;

             // Predictions of mean DO
             dopred0 = dmax0 +d2[sitenum0] .* (t0 - tstart0[siteyrnum0]);
             dopred1 = dmax[sitenum1] + (-exp(mud[1]) + mud[2]*chlsite[sitenum1] +
                            mud[3]*depth[sitenum1] + mud[4]*doc[sitenum1]).*
                            (t1 - tstart[sitenum1]);
             dopred2 = dmax[sitenum2] + (-exp(mud[1]) + mud[2]*chlsite[sitenum2] +
                            mud[3]*depth[sitenum2] + mud[4]*doc[sitenum2]).*
                            (t2 - tstart[sitenum2]);
         }
         model {
             muchl ~ normal(0,3);
             sigchl ~ cauchy(0,3);
             etachl ~ normal(0,1);

             sig ~ cauchy(0,3);

             // Priors for strat day based on Demers and Kalff coef scaled
             // Scaled by centering applied to mean annual temeprature
             b[1] ~ normal(-3.6, 2);
             b[2] ~ normal(-0.71, 0.4);
             // Priors for variance in strat day based on Demers and Kalff
             // scaled
             sigt ~ normal(0.78, 0.3);
             sigt0 ~ normal(0.78, 0.3);

             etat ~ normal(0,1);
             etat0 ~ normal(0,1);

             mud ~ normal(0,2);

             etad ~ normal(0,1);
             sigd ~ cauchy(0,2);

             sig0 ~ cauchy(0,3);

             // sampling statements
             chl ~ normal(chlsite[sitenum], sigchl[2]);
             domean ~ normal(dopred1,  sig);
             domean2 ~ normal(dopred2, sig);
             domean0 ~ normal(dopred0, sig0);

         }
         generated quantities {
             vector[n[1]] dop;     // predictive posterior national
             vector[n0] do0p;      // predictive posterior MO
             for (i in 1:n[1]) dop[i] = normal_rng(dopred1[i], sig);
             for (i in 1:n0) do0p[i] = normal_rng(dopred0[i], sig0);
         }
     '


    if (runmod) {
        require(rstan)
        rstan_options(auto_write = TRUE)
        nchains <- 6
        options(mc.cores = nchains)

        fit <- stan(model_code = modstan, data = datstan,
                    iter = 900, chains = nchains, warmup = 300,
                    control = list(max_treedepth = 15,
                                   adapt_delta = 0.9), thin = 2)

        return(fit)
    }
    else {
        ## plot median posterior prediction vs. observation
        varout <- extract(fitout, pars = c("do0p", "dop"))
        dopmed <- apply(varout$dop, 2, median)
        do0pmed <- apply(varout$do0p, 2, median)
        dev.new()
        par(mar = c(4,4,1,1), mfrow = c(1,2), mgp = c(2.3,1,0),
            bty = "l")
        plot(dopmed, dftemp1$domean, xlab = "Predicted DO",
             ylab = "Observed DO", pch = 21, col = "grey39",
             bg = "white", main = "NLA")
        abline(0,1)
        plot(do0pmed, df1$domean.all, xlab = "Predicted DO",
             ylab = "Observed DO", pch = 21, col= "grey39",
             bg = "white", main = "MO")
        abline(0,1)
    }
}

## runmod variable set to T to run simulation and set to F to
##  run post processing.
fitout <- dochlmod(dat.merge.all, modat0, runmod = T)
## post process
dochlmod(dat.merge.all, modat0, fitout, runmod = F)
