#THIS SCRIPT CALCULATES THE PROBABILITY A SYSTEM HAS TREATMENT CHANGE                	  #
#CREATED BY TOM BENEKE (THE CADMUS GROUP) 04/17/2018									  #
###########################################################################################

#import libraries
library(plyr)
library(stringr)

#set working directory
setwd("C:/Users/thomas.beneke/Desktop/ChangeSourceTreatment_04122018")

#read in source data
#this is a CSV version of Laura's "tblWSF-Treat" table in the "ChangeTreat-QA" Access database
wsf.df <- read.csv("tblWSF-Treat.csv", stringsAsFactors=FALSE, header=FALSE)
colnames(wsf.df) <- c("PWSID","FacilityID","IsSourceInd","Active_YN","FacilityDeadline","DeactiveDate","FacilityType","WaterType","Availability",
						"SellerPWSID","SellerTreatment","TreatProcess","TreatObj","TreatID","TreatCode","TreatPID","WaterTypeCode","ReportedYear","SourceCode")

#read in community water system table
#this is a CSV version of Laura's "tblC_NTNC" table in the "ChangeInTreat-QA" Access database						
cws.df <- read.csv("tblC_NTNC.csv", stringsAsFactors=FALSE, header=TRUE)
cws.df$PWSID <- cws.df$PWS.ID

#read in size category sorting CSV
sort.ord <- read.csv("SortSizeCategories.csv", stringsAsFactors = FALSE, header=TRUE)

#read in pwsids that were already included in source change counts/probabilities
src.chng <- read.csv("Unique_PWSIDs_SourceChange.csv", stringsAsFactors = FALSE, header = TRUE)

#join cws data to wsf dataframe & remove unnecessary columns
wsf.cws <- join(wsf.df, cws.df, by=c('PWSID'), type='left', match='all')
wsf.cws <- wsf.cws[,c("PWSID","FacilityID","PWS.Type","TreatCode","TreatPID","CCT_YN","GW.or.SW","Size.Category","IsSourceInd","ReportedYear","SourceCode","Availability")]

#select only community water systems and add "points" field
wsf.cws <- wsf.cws[wsf.cws$PWS.Type == "Community water system" & wsf.cws$Availability != "Emergency" & is.na(wsf.cws$PWS.Type) == FALSE,]
wsf.cws <- wsf.cws[wsf.cws$PWSID %in% cws.df$PWSID == TRUE,]
wsf.cws$Points <- 1

#add unique identifier for Treat Code ID - Process Code ID combination
wsf.cws$TID_PID <- paste0(wsf.cws$TreatCode,"_",wsf.cws$TreatPID)
wsf.cws <- wsf.cws[wsf.cws$PWSID %in% src.chng$PWSID == FALSE,]

#aggregate counts by PWSID, source type, and year, for each year
yr13 <- aggregate(Points ~ PWSID + ReportedYear, data = wsf.cws[wsf.cws$ReportedYear == 2013 & wsf.cws$TreatCode != "-" & wsf.cws$TreatCode != "",], "sum")
yr14 <- aggregate(Points ~ PWSID + ReportedYear, data = wsf.cws[wsf.cws$ReportedYear == 2014 & wsf.cws$TreatCode != "-" & wsf.cws$TreatCode != "",], "sum")
yr15 <- aggregate(Points ~ PWSID + ReportedYear, data = wsf.cws[wsf.cws$ReportedYear == 2015 & wsf.cws$TreatCode != "-" & wsf.cws$TreatCode != "",], "sum")
yr16 <- aggregate(Points ~ PWSID + ReportedYear, data = wsf.cws[wsf.cws$ReportedYear == 2016 & wsf.cws$TreatCode != "-" & wsf.cws$TreatCode != "",], "sum")

#create empty dataframe to append counts by year to
pws <- data.frame(Obj_ID = seq(1:length(unique(wsf.cws$PWSID))),PWSID = unique(wsf.cws$PWSID))

#merge in each year's counts of wells
pws <- merge(pws,yr13[,c(1,3)],by="PWSID",all=TRUE)
pws <- merge(pws,yr14[,c(1,3)],by="PWSID",all=TRUE)
pws <- merge(pws,yr15[,c(1,3)],by="PWSID",all=TRUE)
pws <- merge(pws,yr16[,c(1,3)],by="PWSID",all=TRUE)

#remove unnecessary columns
pws <- pws[,c(1,3,4,5,6)]
colnames(pws) <- c("PWSID","y13","y14","y15","y16")

#change NAs to 0 for numeric calculations
pws$y13[is.na(pws$y13)==TRUE] <- 0
pws$y14[is.na(pws$y14)==TRUE] <- 0
pws$y15[is.na(pws$y15)==TRUE] <- 0
pws$y16[is.na(pws$y16)==TRUE] <- 0

#identify year sequences with a change
pws$cnt_delta_1314 <- 0
pws$cnt_delta_1415 <- 0
pws$cnt_delta_1516 <- 0
pws$cnt_delta_1314[pws$y14 > pws$y13] <- 1
pws$cnt_delta_1415[pws$y15 > pws$y14] <- 1
pws$cnt_delta_1516[pws$y16 > pws$y15] <- 1

#join size category, source type, and cct designation back into dataframe
pws <- join(pws, cws.df[,c(37,23,8,12)][cws.df$PWS.Type == "Community water system" & is.na(cws.df$PWS.Type) == FALSE,], by=c('PWSID'), type='left', match='all')
pws$CCT_YN[pws$CCT_YN == 1] <- "Y"
pws$CCT_YN[pws$CCT_YN == 0] <- "N"

#remove rows for PWS IDs that were already counted in the source change probabilities
pws <- pws[pws$PWSID %in% src.chng$PWSID == FALSE,]

#write CSV of systems with a treatment change
pws.chng <- pws
pws.chng$changecount <- rowSums(pws.chng[,(6:8)], na.rm=TRUE)
pws.chng <- pws.chng[,c(1,12)][pws.chng$changecount >=1,]
pws.chng <- pws.chng[duplicated(pws.chng)==FALSE,]
write.csv(pws.chng, "Unique_PWSIDs_TreatChange.csv", row.names=FALSE)

#add counts of number of system changes by year
prob1314 <- aggregate(cnt_delta_1314 ~ CCT_YN + Size.Category + GW.or.SW, data = pws, "sum")
prob1415 <- aggregate(cnt_delta_1415 ~ CCT_YN + Size.Category + GW.or.SW, data = pws, "sum")
prob1516 <- aggregate(cnt_delta_1516 ~ CCT_YN + Size.Category + GW.or.SW, data = pws, "sum")
probs <- prob1314
probs$cnt_delta_1415[match(probs$CCT_YN,prob1415$CCT_YN) & match(probs$Size.Category,prob1415$Size.Category) & match(probs$GW.or.SW,prob1415$GW.or.SW)] <- prob1415$cnt_delta_1415[match(probs$CCT_YN,prob1415$CCT_YN) & match(probs$Size.Category,prob1415$Size.Category) & match(probs$GW.or.SW,prob1415$GW.or.SW)]
probs$cnt_delta_1516[match(probs$CCT_YN,prob1516$CCT_YN) & match(probs$Size.Category,prob1516$Size.Category) & match(probs$GW.or.SW,prob1516$GW.or.SW)] <- prob1516$cnt_delta_1516[match(probs$CCT_YN,prob1516$CCT_YN) & match(probs$Size.Category,prob1516$Size.Category) & match(probs$GW.or.SW,prob1516$GW.or.SW)]

#add counts of all pws systems by size category,cct,and source water type
pws.unique <- cws.df[,c(37,3,8,12,23)][duplicated(cws.df[,c(37,3,8,12,23)])==FALSE & cws.df$PWS.Type == "Community water system",]
colnames(pws.unique) <- c("PWSID","PWS.Type","Size.Category","GW.or.SW","CCT")
pws.unique$CCT[pws.unique$CCT == 0] <- "N"  
pws.unique$CCT[pws.unique$CCT == 1] <- "Y"  
pws.unique$Points <- 1
all.sys <- aggregate(Points ~ CCT + Size.Category + GW.or.SW, data = pws.unique, "sum")
all.sys$UnqID <- paste0(all.sys$CCT,"_",all.sys$Size.Category,"_",all.sys$GW.or.SW)

#add unique id and join to table of all system counts by category
probs$UnqID <- paste0(probs$CCT_YN,"_",probs$Size.Category,"_",probs$GW.or.SW)
probs <- join(probs, all.sys[,c(5,4)], by=c('UnqID'), type='left', match='all')
probs$Points[is.na(probs$Points)==TRUE] <- 0
probs <- probs[,c(1:6,8)]
colnames(probs) <- c("CCT_YN","SizeCat","GW_SW","cnt_delta_1314","cnt_delta_1415","cnt_delta_1516","cnt_all_pws")

#calculate probabilities
probs$prob1314 <- round(probs$cnt_delta_1314/probs$cnt_all_pws,2)
probs$prob1415 <- round(probs$cnt_delta_1415/probs$cnt_all_pws,2)
probs$prob1516 <- round(probs$cnt_delta_1516/probs$cnt_all_pws,2)
probs$prob_avg <- round((probs$prob1314 + probs$prob1415 + probs$prob1516)/3,2)

#sort probabilities according to SDWIS format
probs$SortOrd <- sort.ord$SortOrder[match(probs$SizeCat,sort.ord$SizeCat)]
probs <- probs[order(probs$GW_SW,probs$SortOrd),]

#write to file
write.csv(probs, "Probability_TreatmentChange_041718.csv", row.names=FALSE, na="")
