#THIS SCRIPT CALCULATES THE PROBABILITY THAT A SYSTEM WILL HAVE A SOURCE CHANGE  #
#FOR NTNCWS ONLY																 #
#CREATED BY TOM BENEKE (THE CADMUS GROUP) 05/30/2018							 #
##################################################################################

#import libraries
library(plyr)
library(stringr)

#set working directory
setwd("C:/Users/thomas.beneke/Desktop/ChangeSourceTreatment_05302018 - NTNCWS")

#read in source data
#this is a CSV version of Laura's "tblWSF-Treat" table in the "ChangeInSource-QA" Access database
wsf.df <- read.csv("tblWSF-Treat.csv", stringsAsFactors=FALSE, header=FALSE)
colnames(wsf.df) <- c("PWSID","FacilityID","IsSourceInd","Active_YN","FacilityDeadline","DeactiveDate","FacilityType","WaterType","Availability",
						"SellerPWSID","SellerTreatment","TreatProcess","TreatObj","TreatID","TreatCode","TreatPID","WaterTypeCode","ReportedYear","SourceCode")

#read in community water system table
#this is a CSV version of Laura's "tblC_NTNC" table in the "ChangeInSource-QA" Access database						
ntncws.df <- read.csv("tblC_NTNC.csv", stringsAsFactors=FALSE, header=TRUE)
ntncws.df$PWSID <- ntncws.df$PWS.ID

#read in size category sorting CSV
sort.ord <- read.csv("SortSizeCategories.csv", stringsAsFactors = FALSE, header=TRUE)

#join ntncws data to wsf dataframe & remove unnecessary columns
wsf.ntncws <- join(wsf.df, ntncws.df, by=c('PWSID'), type='left', match='all')
wsf.ntncws <- wsf.ntncws[,c("PWSID","FacilityID","PWS.Type","CCT_YN","GW.or.SW","Size.Category","IsSourceInd","ReportedYear","SourceCode","Availability")]

#select only non-community non-transient water systems and add "points" field
#then subset for only systems in the current NTNCWS inventory
#that are NOT emergency sources
wsf.ntncws.ind <- wsf.ntncws[wsf.ntncws$PWS.Type == "Non-Transient non-community system" & wsf.ntncws$Availability != "Emergency" & wsf.ntncws$IsSourceInd == "Y" & is.na(wsf.ntncws$PWS.Type) == FALSE,]
wsf.ntncws.ind <- wsf.ntncws.ind[wsf.ntncws.ind$PWSID %in% ntncws.df$PWSID == TRUE,]
wsf.ntncws.ind$Points <- 1
#subset so that only grabs unique facilities
wsf.ntncws.ind <- wsf.ntncws.ind[duplicated(wsf.ntncws.ind)==FALSE,]
#add unique identifier for PWSID - SourceCode combination
wsf.ntncws.ind$PWSID_SC <- paste0(wsf.ntncws.ind$PWSID,"_",wsf.ntncws.ind$SourceCode)
wsf.ntncws.ind$CCT_YN[wsf.ntncws.ind$CCT_YN == 1] <- "Y"
wsf.ntncws.ind$CCT_YN[wsf.ntncws.ind$CCT_YN == 0] <- "N"

#aggregate counts by PWSID, source type, and year, for each year
yr13 <- aggregate(Points ~ PWSID_SC + ReportedYear, data = wsf.ntncws.ind[wsf.ntncws.ind$ReportedYear == 2013,], "sum")
yr14 <- aggregate(Points ~ PWSID_SC + ReportedYear, data = wsf.ntncws.ind[wsf.ntncws.ind$ReportedYear == 2014,], "sum")
yr15 <- aggregate(Points ~ PWSID_SC + ReportedYear, data = wsf.ntncws.ind[wsf.ntncws.ind$ReportedYear == 2015,], "sum")
yr16 <- aggregate(Points ~ PWSID_SC + ReportedYear, data = wsf.ntncws.ind[wsf.ntncws.ind$ReportedYear == 2016,], "sum")

#create empty dataframe to append counts by year to
pws <- data.frame(Obj_ID = seq(1:length(unique(wsf.ntncws.ind$PWSID_SC))),PWSID_SC = unique(wsf.ntncws.ind$PWSID_SC))

#merge in each year's counts of wells
pws <- merge(pws,yr13[,c(1,3)],by="PWSID_SC",all=TRUE)
pws <- merge(pws,yr14[,c(1,3)],by="PWSID_SC",all=TRUE)
pws <- merge(pws,yr15[,c(1,3)],by="PWSID_SC",all=TRUE)
pws <- merge(pws,yr16[,c(1,3)],by="PWSID_SC",all=TRUE)

#remove unnecessary columns
pws <- pws[,c(1,3,4,5,6)]
colnames(pws) <- c("PWSID_SC","y13","y14","y15","y16")

#change NAs to 0 for numeric calculations
pws$y13[is.na(pws$y13)==TRUE] <- 0
pws$y14[is.na(pws$y14)==TRUE] <- 0
pws$y15[is.na(pws$y15)==TRUE] <- 0
pws$y16[is.na(pws$y16)==TRUE] <- 0

#remove rows where all years are not equal
pws <- pws[pws$y13 != pws$y14 | pws$y13 != pws$y15 | pws$y13 != pws$y16 | pws$y14 != pws$y13 | pws$y14 != pws$y15 | pws$y14 != pws$y16 | pws$y15 != pws$y14 | pws$y15 != pws$y14 | pws$y15 != pws$y16,]

#split id to parse out PWS ID 
pws <- cbind(pws,data.frame(str_split_fixed(pws$PWSID_SC, "_", 2)))
colnames(pws) <- c("PWSID_SC","y13","y14","y15","y16","PWSID","SourceCode")

#join size category, source type, and cct designation back into dataframe
pws <- join(pws, ntncws.df[,c(37,23,8,12)][ntncws.df$PWS.Type == "Non-Transient non-community system" & is.na(ntncws.df$PWS.Type) == FALSE,], by=c('PWSID'), type='left', match='all')
pws$CCT_YN[pws$CCT_YN == 1] <- "Y"
pws$CCT_YN[pws$CCT_YN == 0] <- "N"

#identify year sequences with an additional source ADDED
pws$cnt_delta_1314 <- 0
pws$cnt_delta_1415 <- 0
pws$cnt_delta_1516 <- 0
pws$cnt_delta_1314[pws$y14 > pws$y13] <- 1
pws$cnt_delta_1415[pws$y15 > pws$y14] <- 1
pws$cnt_delta_1516[pws$y16 > pws$y15] <- 1

#remove duplicates
pws <- pws[,c(6,8:13)]
pws <- pws[duplicated(pws)==FALSE,]

#loop through and remove duplicate PWS system IDs while preserving year counts of source code matches
for (i in unique(pws$PWSID)){
	tmp <- pws[pws$PWSID == i,]
	if(nrow(tmp)>1){	
	tmp <- rbind(tmp,data.frame(PWSID = i,CCT_YN = tmp[1,2],Size.Category = tmp[1,3],GW.or.SW = tmp[1,4],cnt_delta_1314 = sum(tmp[,5]),cnt_delta_1415 = sum(tmp[,6]),cnt_delta_1516 = sum(tmp[,7])))
	}
	tmp <- tmp[nrow(tmp),]
	tmp[,5][tmp[,5] > 1] <- 1
	tmp[,6][tmp[,6] > 1] <- 1
	tmp[,7][tmp[,7] > 1] <- 1
	if(exists("pws.out")){pws.out<-rbind(pws.out,tmp)} else{(pws.out<-tmp)}
}

#write csv of unique PWS IDs with source changes
#this is used in the treatment change R script to remove those with a source change from the treatment counts
write.csv(pws.out, "Unique_PWSIDs_SourceChange.csv", row.names = FALSE)

#get counts of total unique systems by category
pws.unique <- ntncws.df[,c(37,3,8,12,23)][duplicated(ntncws.df[,c(37,3,8,12,23)])==FALSE & ntncws.df$PWS.Type == "Non-Transient non-community system",]
colnames(pws.unique) <- c("PWSID","PWS.Type","Size.Category","GW.or.SW","CCT")
pws.unique$CCT[pws.unique$CCT == 0] <- "N"  
pws.unique$CCT[pws.unique$CCT == 1] <- "Y"  
pws.unique$Points <- 1
all.sys <- aggregate(Points ~ CCT + Size.Category + GW.or.SW, data = pws.unique, "sum")
all.sys$UnqID <- paste0(all.sys$CCT,"_",all.sys$Size.Category,"_",all.sys$GW.or.SW)

#add counts of number of system changes by year
prob1314 <- aggregate(cnt_delta_1314 ~ CCT_YN + Size.Category + GW.or.SW, data = pws.out, "sum")
prob1415 <- aggregate(cnt_delta_1415 ~ CCT_YN + Size.Category + GW.or.SW, data = pws.out, "sum")
prob1516 <- aggregate(cnt_delta_1516 ~ CCT_YN + Size.Category + GW.or.SW, data = pws.out, "sum")
probs <- prob1314
probs$cnt_delta_1415[match(probs$CCT_YN,prob1415$CCT_YN) & match(probs$Size.Category,prob1415$Size.Category) & match(probs$GW.or.SW,prob1415$GW.or.SW)] <- prob1415$cnt_delta_1415[match(probs$CCT_YN,prob1415$CCT_YN) & match(probs$Size.Category,prob1415$Size.Category) & match(probs$GW.or.SW,prob1415$GW.or.SW)]
probs$cnt_delta_1516[match(probs$CCT_YN,prob1516$CCT_YN) & match(probs$Size.Category,prob1516$Size.Category) & match(probs$GW.or.SW,prob1516$GW.or.SW)] <- prob1516$cnt_delta_1516[match(probs$CCT_YN,prob1516$CCT_YN) & match(probs$Size.Category,prob1516$Size.Category) & match(probs$GW.or.SW,prob1516$GW.or.SW)]

#add unique id and join to table of all system counts by category
probs$UnqID <- paste0(probs$CCT_YN,"_",probs$Size.Category,"_",probs$GW.or.SW)
probs <- join(probs, all.sys[,c(5,4)], by=c('UnqID'), type='left', match='all')
probs$Points[is.na(probs$Points)==TRUE] <- 0
probs <- probs[,c(1:6,8)]
colnames(probs) <- c("CCT_YN","SizeCat","GW_SW","cnt_delta_1314","cnt_delta_1415","cnt_delta_1516","cnt_all_pws")

#calculate probabilities
probs$prob1314 <- round(probs$cnt_delta_1314/probs$cnt_all_pws,2)
probs$prob1415 <- round(probs$cnt_delta_1415/probs$cnt_all_pws,2)
probs$prob1516 <- round(probs$cnt_delta_1516/probs$cnt_all_pws,2)
probs$prob_avg <- round((probs$prob1314 + probs$prob1415 + probs$prob1516)/3,2)

#sort probabilities according to SDWIS format
probs$SortOrd <- sort.ord$SortOrder[match(probs$SizeCat,sort.ord$SizeCat)]
probs <- probs[order(probs$GW_SW,probs$SortOrd),]

#write to file
write.csv(probs, "Probability_SourceChange_NTNCWS_070218.csv", row.names=FALSE, na="")