#THIS SCRIPT CALCULATES THE PROBABILITY A SYSTEM HAS TREATMENT CHANGE                	  #
#FOR NTNCWS ONLY                                                                          #
#CREATED BY TOM BENEKE (THE CADMUS GROUP) 05/30/2018									  #
###########################################################################################

#import libraries
library(plyr)
library(stringr)

#set working directory
setwd("C:/Users/thomas.beneke/Desktop/ChangeSourceTreatment_05302018 - NTNCWS")

#read in source data
#this is a CSV version of Laura's "tblWSF-Treat" table in the "ChangeTreat-QA" Access database
wsf.df <- read.csv("tblWSF-Treat.csv", stringsAsFactors=FALSE, header=FALSE)
colnames(wsf.df) <- c("PWSID","FacilityID","IsSourceInd","Active_YN","FacilityDeadline","DeactiveDate","FacilityType","WaterType","Availability",
						"SellerPWSID","SellerTreatment","TreatProcess","TreatObj","TreatID","TreatCode","TreatPID","WaterTypeCode","ReportedYear","SourceCode")

#read in community water system table
#this is a CSV version of Laura's "tblC_NTNC" table in the "ChangeInTreat-QA" Access database						
ntncws.df <- read.csv("tblC_NTNC.csv", stringsAsFactors=FALSE, header=TRUE)
ntncws.df$PWSID <- ntncws.df$PWS.ID

#read in size category sorting CSV
sort.ord <- read.csv("SortSizeCategories.csv", stringsAsFactors = FALSE, header=TRUE)

#read in pwsids that were already included in source change counts/probabilities
src.chng <- read.csv("Unique_PWSIDs_SourceChange.csv", stringsAsFactors = FALSE, header = TRUE)

#join cws data to wsf dataframe & remove unnecessary columns
wsf.ntncws <- join(wsf.df, ntncws.df, by=c('PWSID'), type='left', match='all')
wsf.ntncws <- wsf.ntncws[,c("PWSID","FacilityID","PWS.Type","TreatCode","TreatPID","CCT_YN","GW.or.SW","Size.Category","IsSourceInd","ReportedYear","SourceCode","Availability")]

#select only non-transient non-community water systems and add "points" field
wsf.ntncws <- wsf.ntncws[wsf.ntncws$PWS.Type == "Non-Transient non-community system" & wsf.ntncws$Availability != "Emergency" & is.na(wsf.ntncws$PWS.Type) == FALSE,]
wsf.ntncws <- wsf.ntncws[wsf.ntncws$PWSID %in% ntncws.df$PWSID == TRUE,]
wsf.ntncws$Points <- 1

#add unique identifier for Treat Code ID - Process Code ID combination
wsf.ntncws$TID_PID <- paste0(wsf.ntncws$TreatCode,"_",wsf.ntncws$TreatPID)
wsf.ntncws <- wsf.ntncws[wsf.ntncws$PWSID %in% src.chng$PWSID == FALSE,]

#aggregate counts by PWSID, source type, and year, for each year
yr13 <- aggregate(Points ~ PWSID + ReportedYear, data = wsf.ntncws[wsf.ntncws$ReportedYear == 2013 & wsf.ntncws$TreatCode != "-" & wsf.ntncws$TreatCode != "",], "sum")
yr14 <- aggregate(Points ~ PWSID + ReportedYear, data = wsf.ntncws[wsf.ntncws$ReportedYear == 2014 & wsf.ntncws$TreatCode != "-" & wsf.ntncws$TreatCode != "",], "sum")
yr15 <- aggregate(Points ~ PWSID + ReportedYear, data = wsf.ntncws[wsf.ntncws$ReportedYear == 2015 & wsf.ntncws$TreatCode != "-" & wsf.ntncws$TreatCode != "",], "sum")
yr16 <- aggregate(Points ~ PWSID + ReportedYear, data = wsf.ntncws[wsf.ntncws$ReportedYear == 2016 & wsf.ntncws$TreatCode != "-" & wsf.ntncws$TreatCode != "",], "sum")

#create empty dataframe to append counts by year to
pws <- data.frame(Obj_ID = seq(1:length(unique(wsf.ntncws$PWSID))),PWSID = unique(wsf.ntncws$PWSID))

#merge in each year's counts of wells
pws <- merge(pws,yr13[,c(1,3)],by="PWSID",all=TRUE)
pws <- merge(pws,yr14[,c(1,3)],by="PWSID",all=TRUE)
pws <- merge(pws,yr15[,c(1,3)],by="PWSID",all=TRUE)
pws <- merge(pws,yr16[,c(1,3)],by="PWSID",all=TRUE)

#remove unnecessary columns
pws <- pws[,c(1,3,4,5,6)]
colnames(pws) <- c("PWSID","y13","y14","y15","y16")

#change NAs to 0 for numeric calculations
pws$y13[is.na(pws$y13)==TRUE] <- 0
pws$y14[is.na(pws$y14)==TRUE] <- 0
pws$y15[is.na(pws$y15)==TRUE] <- 0
pws$y16[is.na(pws$y16)==TRUE] <- 0

#identify year sequences with a change
pws$cnt_delta_1314 <- 0
pws$cnt_delta_1415 <- 0
pws$cnt_delta_1516 <- 0
pws$cnt_delta_1314[pws$y14 > pws$y13] <- 1
pws$cnt_delta_1415[pws$y15 > pws$y14] <- 1
pws$cnt_delta_1516[pws$y16 > pws$y15] <- 1

#join size category, source type, and cct designation back into dataframe
pws <- join(pws, ntncws.df[,c(37,23,8,12)][ntncws.df$PWS.Type == "Non-Transient non-community system" & is.na(ntncws.df$PWS.Type) == FALSE,], by=c('PWSID'), type='left', match='all')
pws$CCT_YN[pws$CCT_YN == 1] <- "Y"
pws$CCT_YN[pws$CCT_YN == 0] <- "N"

#remove rows for PWS IDs that were already counted in the source change probabilities
pws <- pws[pws$PWSID %in% src.chng$PWSID == FALSE,]

#write CSV of systems with a treatment change
pws.chng <- pws
pws.chng$changecount <- rowSums(pws.chng[,(6:8)], na.rm=TRUE)
pws.chng <- pws.chng[,c(1,12)][pws.chng$changecount >=1,]
pws.chng <- pws.chng[duplicated(pws.chng)==FALSE,]
write.csv(pws.chng, "Unique_PWSIDs_TreatChange.csv", row.names=FALSE)

#add counts of number of system changes by year
prob1314 <- aggregate(cnt_delta_1314 ~ CCT_YN + Size.Category + GW.or.SW, data = pws, "sum")
prob1415 <- aggregate(cnt_delta_1415 ~ CCT_YN + Size.Category + GW.or.SW, data = pws, "sum")
prob1516 <- aggregate(cnt_delta_1516 ~ CCT_YN + Size.Category + GW.or.SW, data = pws, "sum")
probs <- prob1314
probs$cnt_delta_1415[match(probs$CCT_YN,prob1415$CCT_YN) & match(probs$Size.Category,prob1415$Size.Category) & match(probs$GW.or.SW,prob1415$GW.or.SW)] <- prob1415$cnt_delta_1415[match(probs$CCT_YN,prob1415$CCT_YN) & match(probs$Size.Category,prob1415$Size.Category) & match(probs$GW.or.SW,prob1415$GW.or.SW)]
probs$cnt_delta_1516[match(probs$CCT_YN,prob1516$CCT_YN) & match(probs$Size.Category,prob1516$Size.Category) & match(probs$GW.or.SW,prob1516$GW.or.SW)] <- prob1516$cnt_delta_1516[match(probs$CCT_YN,prob1516$CCT_YN) & match(probs$Size.Category,prob1516$Size.Category) & match(probs$GW.or.SW,prob1516$GW.or.SW)]

#add counts of all pws systems by size category,cct,and source water type
pws.unique <- ntncws.df[,c(37,3,8,12,23)][duplicated(ntncws.df[,c(37,3,8,12,23)])==FALSE & ntncws.df$PWS.Type == "Non-Transient non-community system",]
colnames(pws.unique) <- c("PWSID","PWS.Type","Size.Category","GW.or.SW","CCT")
pws.unique$CCT[pws.unique$CCT == 0] <- "N"  
pws.unique$CCT[pws.unique$CCT == 1] <- "Y"  
pws.unique$Points <- 1
all.sys <- aggregate(Points ~ CCT + Size.Category + GW.or.SW, data = pws.unique, "sum")
all.sys$UnqID <- paste0(all.sys$CCT,"_",all.sys$Size.Category,"_",all.sys$GW.or.SW)

#add unique id and join to table of all system counts by category
probs$UnqID <- paste0(probs$CCT_YN,"_",probs$Size.Category,"_",probs$GW.or.SW)
probs <- join(probs, all.sys[,c(5,4)], by=c('UnqID'), type='left', match='all')
probs$Points[is.na(probs$Points)==TRUE] <- 0
probs <- probs[,c(1:6,8)]
colnames(probs) <- c("CCT_YN","SizeCat","GW_SW","cnt_delta_1314","cnt_delta_1415","cnt_delta_1516","cnt_all_pws")

#calculate probabilities
probs$prob1314 <- round(probs$cnt_delta_1314/probs$cnt_all_pws,2)
probs$prob1415 <- round(probs$cnt_delta_1415/probs$cnt_all_pws,2)
probs$prob1516 <- round(probs$cnt_delta_1516/probs$cnt_all_pws,2)
probs$prob_avg <- round((probs$prob1314 + probs$prob1415 + probs$prob1516)/3,2)

#sort probabilities according to SDWIS format
probs$SortOrd <- sort.ord$SortOrder[match(probs$SizeCat,sort.ord$SizeCat)]
probs <- probs[order(probs$GW_SW,probs$SortOrd),]

#write to file
write.csv(probs, "Probability_TreatmentChange__NTNCWS_053018.csv", row.names=FALSE, na="")
