#THIS SCRIPT CALCULATES THE PROBABILITY THAT A SYSTEM WILL HAVE A SOURCE CHANGE  #
#CREATED BY TOM BENEKE (THE CADMUS GROUP) 09/28/2018							 #
##################################################################################

#import libraries
library(plyr)
library(stringr)

#set working directory
setwd("C:/Users/thomas.beneke/Desktop/SourceTreatment 090418/CWS_Processing")

#read in source data
#this is a CSV version of Laura's "tblWSF-Treat" table in the "ChangeInSource-QA" Access database
wsf.df <- read.csv("tblWSF-Treat.csv", stringsAsFactors=FALSE, header=FALSE)
colnames(wsf.df) <- c("PWSID","FacilityID","IsSourceInd","Active_YN","FacilityDeadline","DeactiveDate","FacilityType","WaterType","Availability",
						"SellerPWSID","SellerTreatment","TreatProcess","TreatObj","TreatID","TreatCode","TreatPID","WaterTypeCode","ReportedYear","SourceCode")

#read in community water system table
#this is a CSV version of Laura's "tblC_NTNC" table in the "ChangeInSource-QA" Access database						
cws.df <- read.csv("tblC_NTNC.csv", stringsAsFactors=FALSE, header=TRUE)
cws.df$PWSID <- cws.df$PWS.ID

#read in size category sorting CSV
sort.ord <- read.csv("SortSizeCategories.csv", stringsAsFactors = FALSE, header=TRUE)

#join cws data to wsf dataframe & remove unnecessary columns
wsf.cws <- join(wsf.df, cws.df, by=c('PWSID'), type='left', match='all')
wsf.cws <- wsf.cws[,c("PWSID","FacilityID","PWS.Type","CCT_YN","GW.or.SW","Size.Category","IsSourceInd","ReportedYear","SourceCode","Availability")]

#select only community water systems and add "points" field
#then subset for only systems in the current CWS inventory
#that are NOT emergency sources
wsf.cws.ind <- wsf.cws[wsf.cws$PWS.Type == "Community water system" & wsf.cws$Availability != "Emergency" & wsf.cws$IsSourceInd == "Y" & is.na(wsf.cws$PWS.Type) == FALSE,]
wsf.cws.ind <- wsf.cws.ind[wsf.cws.ind$PWSID %in% cws.df$PWSID == TRUE,]
wsf.cws.ind$Points <- 1
#subset so that only grabs unique facilities
wsf.cws.ind <- wsf.cws.ind[duplicated(wsf.cws.ind)==FALSE,]
#add unique identifier for PWSID - SourceCode combination
wsf.cws.ind$PWSID_SC <- paste0(wsf.cws.ind$PWSID,"_",wsf.cws.ind$SourceCode)
wsf.cws.ind$CCT_YN[wsf.cws.ind$CCT_YN == 1] <- "Y"
wsf.cws.ind$CCT_YN[wsf.cws.ind$CCT_YN == 0] <- "N"

#aggregate counts by PWSID, source type, and year, for each year
yr12 <- aggregate(Points ~ PWSID_SC + ReportedYear, data = wsf.cws.ind[wsf.cws.ind$ReportedYear == 2012,], "sum")
yr13 <- aggregate(Points ~ PWSID_SC + ReportedYear, data = wsf.cws.ind[wsf.cws.ind$ReportedYear == 2013,], "sum")
yr14 <- aggregate(Points ~ PWSID_SC + ReportedYear, data = wsf.cws.ind[wsf.cws.ind$ReportedYear == 2014,], "sum")
yr15 <- aggregate(Points ~ PWSID_SC + ReportedYear, data = wsf.cws.ind[wsf.cws.ind$ReportedYear == 2015,], "sum")

#create empty dataframe to append counts by year to
pws <- data.frame(Obj_ID = seq(1:length(unique(wsf.cws.ind$PWSID_SC))),PWSID_SC = unique(wsf.cws.ind$PWSID_SC))

#merge in each year's counts of wells
pws <- merge(pws,yr12[,c(1,3)],by="PWSID_SC",all=TRUE)
pws <- merge(pws,yr13[,c(1,3)],by="PWSID_SC",all=TRUE)
pws <- merge(pws,yr14[,c(1,3)],by="PWSID_SC",all=TRUE)
pws <- merge(pws,yr15[,c(1,3)],by="PWSID_SC",all=TRUE)

#remove unnecessary columns
pws <- pws[,c(1,3,4,5,6)]
colnames(pws) <- c("PWSID_SC","y12","y13","y14","y15")

#change NAs to 0 for numeric calculations
pws$y12[is.na(pws$y12)==TRUE] <- 0
pws$y13[is.na(pws$y13)==TRUE] <- 0
pws$y14[is.na(pws$y14)==TRUE] <- 0
pws$y15[is.na(pws$y15)==TRUE] <- 0

#remove rows where all years are not equal
pws <- pws[pws$y12 != pws$y13 | pws$y12 != pws$y14 | pws$y12 != pws$y15 | pws$y13 != pws$y14 | pws$y13 != pws$y15 | pws$y13 != pws$y12 | pws$y14 != pws$y15 | pws$y14 != pws$y16 | pws$y14 != pws$y12,]

#split id to parse out PWS ID 
pws <- cbind(pws,data.frame(str_split_fixed(pws$PWSID_SC, "_", 2)))
colnames(pws) <- c("PWSID_SC","y12","y13","y14","y15","PWSID","SourceCode")

#join size category, source type, and cct designation back into dataframe
pws <- join(pws, cws.df[,c(37,23,8,12)][cws.df$PWS.Type == "Community water system" & is.na(cws.df$PWS.Type) == FALSE,], by=c('PWSID'), type='left', match='all')
pws$CCT_YN[pws$CCT_YN == 1] <- "Y"
pws$CCT_YN[pws$CCT_YN == 0] <- "N"

#identify year sequences with an additional source ADDED
pws$cnt_delta_1213 <- 0
pws$cnt_delta_1314 <- 0
pws$cnt_delta_1415 <- 0
pws$cnt_delta_1213[pws$y13 > pws$y12] <- 1
pws$cnt_delta_1314[pws$y14 > pws$y13] <- 1
pws$cnt_delta_1415[pws$y15 > pws$y14] <- 1

#remove duplicates
pws <- pws[,c(6,8:13)]
pws <- pws[duplicated(pws)==FALSE,]

#loop through and remove duplicate PWS system IDs while preserving year counts of source code matches
for (i in unique(pws$PWSID)){
	tmp <- pws[pws$PWSID == i,]
	if(nrow(tmp)>1){	
	tmp <- rbind(tmp,data.frame(PWSID = i,CCT_YN = tmp[1,2],Size.Category = tmp[1,3],GW.or.SW = tmp[1,4],cnt_delta_1314 = sum(tmp[,5]),cnt_delta_1415 = sum(tmp[,6]),cnt_delta_1516 = sum(tmp[,7])))
	}
	tmp <- tmp[nrow(tmp),]
	tmp[,5][tmp[,5] > 1] <- 1
	tmp[,6][tmp[,6] > 1] <- 1
	tmp[,7][tmp[,7] > 1] <- 1
	if(exists("pws.out")){pws.out<-rbind(pws.out,tmp)} else{(pws.out<-tmp)}
}

#write csv of unique PWS IDs with source changes
#this is used in the treatment change R script to remove those with a source change from the treatment counts
write.csv(pws.out, "Unique_PWSIDs_SourceChange.csv", row.names = FALSE)

#get counts of total unique systems by category
pws.unique <- cws.df[,c(37,3,8,12,23)][duplicated(cws.df[,c(37,3,8,12,23)])==FALSE & cws.df$PWS.Type == "Community water system",]
colnames(pws.unique) <- c("PWSID","PWS.Type","Size.Category","GW.or.SW","CCT")
pws.unique$CCT[pws.unique$CCT == 0] <- "N"  
pws.unique$CCT[pws.unique$CCT == 1] <- "Y"  
pws.unique$Points <- 1
all.sys <- aggregate(Points ~ CCT + Size.Category + GW.or.SW, data = pws.unique, "sum")
all.sys$UnqID <- paste0(all.sys$CCT,"_",all.sys$Size.Category,"_",all.sys$GW.or.SW)

#add counts of number of system changes by year
prob1213 <- aggregate(cnt_delta_1213 ~ CCT_YN + Size.Category + GW.or.SW, data = pws.out, "sum")
prob1314 <- aggregate(cnt_delta_1314 ~ CCT_YN + Size.Category + GW.or.SW, data = pws.out, "sum")
prob1415 <- aggregate(cnt_delta_1415 ~ CCT_YN + Size.Category + GW.or.SW, data = pws.out, "sum")
probs <- prob1213
probs$cnt_delta_1314[match(probs$CCT_YN,prob1314$CCT_YN) & match(probs$Size.Category,prob1314$Size.Category) & match(probs$GW.or.SW,prob1314$GW.or.SW)] <- prob1314$cnt_delta_1314[match(probs$CCT_YN,prob1314$CCT_YN) & match(probs$Size.Category,prob1314$Size.Category) & match(probs$GW.or.SW,prob1314$GW.or.SW)]
probs$cnt_delta_1415[match(probs$CCT_YN,prob1415$CCT_YN) & match(probs$Size.Category,prob1415$Size.Category) & match(probs$GW.or.SW,prob1415$GW.or.SW)] <- prob1516$cnt_delta_1415[match(probs$CCT_YN,prob1415$CCT_YN) & match(probs$Size.Category,prob1415$Size.Category) & match(probs$GW.or.SW,prob1415$GW.or.SW)]

#add unique id and join to table of all system counts by category
probs$UnqID <- paste0(probs$CCT_YN,"_",probs$Size.Category,"_",probs$GW.or.SW)
probs <- join(probs, all.sys[,c(5,4)], by=c('UnqID'), type='left', match='all')
probs$Points[is.na(probs$Points)==TRUE] <- 0
probs <- probs[,c(1:6,8)]
colnames(probs) <- c("CCT_YN","SizeCat","GW_SW","cnt_delta_1213","cnt_delta_1314","cnt_delta_1415","cnt_all_pws")

#calculate probabilities
probs$prob1213 <- round(probs$cnt_delta_1213/probs$cnt_all_pws,2)
probs$prob1314 <- round(probs$cnt_delta_1314/probs$cnt_all_pws,2)
probs$prob1415 <- round(probs$cnt_delta_1415/probs$cnt_all_pws,2)
probs$prob_avg <- round((probs$prob1213 + probs$prob1314 + probs$prob1415)/3,2)

#sort probabilities according to SDWIS format
probs$SortOrd <- sort.ord$SortOrder[match(probs$SizeCat,sort.ord$SizeCat)]
probs <- probs[order(probs$GW_SW,probs$SortOrd),]

#write to file
write.csv(probs, "Probability_SourceChange_092818.csv", row.names=FALSE, na="")