
#THIS SCRIPT CALCULATES THE PROBABILITY THAT A SYSTEM WILL HAVE A SOURCE CHANGE  # 
#FROM GROUNDWATER TO SURFACE WATER AND VICE VERSA                                #
#CREATED BY TOM BENEKE (THE CADMUS GROUP) 06/22/2017							 #
##################################################################################

#import libraries
library(plyr)
library(stringr)

#set working directory
setwd("C:/Users/thomas.beneke/Documents/ChangeTreatSourceQA/TomCalculation")

#read in source data
#this is a CSV version of Laura's "tblWSF-Treat" table in the "ChangeInSource-QA" Access database
wsf.df <- read.csv("tblWSF-Treat.csv", stringsAsFactors=FALSE, header=FALSE)
colnames(wsf.df) <- c("PWSID","FacilityID","IsSourceInd","Active_YN","FacilityDeadline","DeactiveDate","FacilityType","WaterType","Availability",
						"SellerPWSID","SellerTreatment","TreatProcess","TreatObj","TreatID","TreatCode","TreatPID","WaterTypeCode","ReportedYear","SourceCode")

#read in community water system table
#this is a CSV version of Laura's "tblC_NTNC" table in the "ChangeInSource-QA" Access database						
cws.df <- read.csv("tblC_NTNC.csv", stringsAsFactors=FALSE, header=TRUE)
cws.df$PWSID <- cws.df$PWS.ID

#join cws data to wsf dataframe & remove unnecessary columns
wsf.cws <- join(wsf.df, cws.df, by=c('PWSID'), type='left', match='all')
wsf.cws <- wsf.cws[,c("PWSID","FacilityID","PWS.Type","CCT_YN","GW.or.SW","Size.Category","IsSourceInd","ReportedYear","SourceCode")]

#select only community water systems and add "points" field
#then subset for only systems in the current CWS inventory
wsf.cws.ind <- wsf.cws[wsf.cws$PWS.Type == "Community water system" & wsf.cws$IsSourceInd == "Y" & is.na(wsf.cws$PWS.Type) == FALSE,]
wsf.cws.ind <- wsf.cws.ind[wsf.cws.ind$PWSID %in% cws.df$PWSID == TRUE,]
wsf.cws.ind$Points <- 1
#subset so that only grabs unique facilities
wsf.cws.ind <- wsf.cws.ind[duplicated(wsf.cws.ind)==FALSE,]
#add unique identifier for PWSID - Facility combination
wsf.cws.ind$PWSID_FAC <- paste0(wsf.cws.ind$PWSID,"_",wsf.cws.ind$FacilityID)
wsf.cws.ind$CCT_YN[wsf.cws.ind$CCT_YN == 1] <- "Y"
wsf.cws.ind$CCT_YN[wsf.cws.ind$CCT_YN == 0] <- "N"

#aggregate counts by PWSID, source type, and year, for each year
yr13 <- aggregate(Points ~ PWSID_FAC + SourceCode + ReportedYear, data = wsf.cws.ind[wsf.cws.ind$ReportedYear == 2013,], "sum")
yr14 <- aggregate(Points ~ PWSID_FAC + SourceCode + ReportedYear, data = wsf.cws.ind[wsf.cws.ind$ReportedYear == 2014,], "sum")
yr15 <- aggregate(Points ~ PWSID_FAC + SourceCode+ ReportedYear, data = wsf.cws.ind[wsf.cws.ind$ReportedYear == 2015,], "sum")
yr16 <- aggregate(Points ~ PWSID_FAC + SourceCode+ ReportedYear, data = wsf.cws.ind[wsf.cws.ind$ReportedYear == 2016,], "sum")

#create empty dataframe to append counts by year to
pws <- data.frame(Obj_ID = seq(1:length(unique(wsf.cws.ind$PWSID_FAC))),PWSID_FAC = unique(wsf.cws.ind$PWSID_FAC))

pws$yr13 <- "-"
pws$yr14 <- "-"
pws$yr15 <- "-"
pws$yr16 <- "-"

#MATCH GWCA
pws$yr13[pws$PWSID_FAC %in% yr13$PWSID_FAC[yr13$SourceCode == "GWCA"]==TRUE] <- "GWCA"
pws$yr14[pws$PWSID_FAC %in% yr14$PWSID_FAC[yr14$SourceCode == "GWCA"]==TRUE] <- "GWCA"
pws$yr15[pws$PWSID_FAC %in% yr15$PWSID_FAC[yr15$SourceCode == "GWCA"]==TRUE] <- "GWCA"
pws$yr16[pws$PWSID_FAC %in% yr16$PWSID_FAC[yr16$SourceCode == "GWCA"]==TRUE] <- "GWCA"
#MATCH GWA
pws$yr13[pws$PWSID_FAC %in% yr13$PWSID_FAC[yr13$SourceCode == "GWA"]==TRUE] <- "GWA"
pws$yr14[pws$PWSID_FAC %in% yr14$PWSID_FAC[yr14$SourceCode == "GWA"]==TRUE] <- "GWA"
pws$yr15[pws$PWSID_FAC %in% yr15$PWSID_FAC[yr15$SourceCode == "GWA"]==TRUE] <- "GWA"
pws$yr16[pws$PWSID_FAC %in% yr16$PWSID_FAC[yr16$SourceCode == "GWA"]==TRUE] <- "GWA"
#MATCH GUA
pws$yr13[pws$PWSID_FAC %in% yr13$PWSID_FAC[yr13$SourceCode == "GUA"]==TRUE] <- "GUA"
pws$yr14[pws$PWSID_FAC %in% yr14$PWSID_FAC[yr14$SourceCode == "GUA"]==TRUE] <- "GUA"
pws$yr15[pws$PWSID_FAC %in% yr15$PWSID_FAC[yr15$SourceCode == "GUA"]==TRUE] <- "GUA"
pws$yr16[pws$PWSID_FAC %in% yr16$PWSID_FAC[yr16$SourceCode == "GUA"]==TRUE] <- "GUA"
#MATCH SWA
pws$yr13[pws$PWSID_FAC %in% yr13$PWSID_FAC[yr13$SourceCode == "SWA"]==TRUE] <- "SWA"
pws$yr14[pws$PWSID_FAC %in% yr14$PWSID_FAC[yr14$SourceCode == "SWA"]==TRUE] <- "SWA"
pws$yr15[pws$PWSID_FAC %in% yr15$PWSID_FAC[yr15$SourceCode == "SWA"]==TRUE] <- "SWA"
pws$yr16[pws$PWSID_FAC %in% yr16$PWSID_FAC[yr16$SourceCode == "SWA"]==TRUE] <- "SWA"
#MATCH SWCA
pws$yr13[pws$PWSID_FAC %in% yr13$PWSID_FAC[yr13$SourceCode == "SWCA"]==TRUE] <- "SWCA"
pws$yr14[pws$PWSID_FAC %in% yr14$PWSID_FAC[yr14$SourceCode == "SWCA"]==TRUE] <- "SWCA"
pws$yr15[pws$PWSID_FAC %in% yr15$PWSID_FAC[yr15$SourceCode == "SWCA"]==TRUE] <- "SWCA"
pws$yr16[pws$PWSID_FAC %in% yr16$PWSID_FAC[yr16$SourceCode == "SWCA"]==TRUE] <- "SWCA"

#FLAG ROWS THAT CHANGE GWCA/SWCA TO GWA/GUA/SWA
pws$FLAG_SC_CHANGE <- 0
pws$FLAG_SC_CHANGE[pws$yr13 == "GWA" & pws$yr14 == "GWCA"|pws$yr13 == "GWA" & pws$yr15 == "GWCA"|pws$yr13 == "GWA" & pws$yr16 == "GWCA"|
					pws$yr13 == "GWCA" & pws$yr14 == "GWA"|pws$yr13 == "GWCA" & pws$yr15 == "GWA"|pws$yr13 == "GWCA" & pws$yr16 == "GWA"]<- 1
pws$FLAG_SC_CHANGE[pws$yr13 == "GUA" & pws$yr14 == "GWCA"|pws$yr13 == "GUA" & pws$yr15 == "GWCA"|pws$yr13 == "GUA" & pws$yr16 == "GWCA"|
					pws$yr13 == "GWCA" & pws$yr14 == "GUA"|pws$yr13 == "GWCA" & pws$yr15 == "GUA"|pws$yr13 == "GWCA" & pws$yr16 == "GUA"]<- 1
pws$FLAG_SC_CHANGE[pws$yr13 == "SWA" & pws$yr14 == "GWCA"|pws$yr13 == "SWA" & pws$yr15 == "GWCA"|pws$yr13 == "SWA" & pws$yr16 == "GWCA"|
					pws$yr13 == "GWCA" & pws$yr14 == "SWA"|pws$yr13 == "GWCA" & pws$yr15 == "SWA"|pws$yr13 == "GWCA" & pws$yr16 == "SWA"]<- 1
pws$FLAG_SC_CHANGE[pws$yr13 == "GWA" & pws$yr14 == "SWCA"|pws$yr13 == "GWA" & pws$yr15 == "SWCA"|pws$yr13 == "GWA" & pws$yr16 == "SWCA"|
					pws$yr13 == "SWCA" & pws$yr14 == "GWA"|pws$yr13 == "SWCA" & pws$yr15 == "GWA"|pws$yr13 == "SWCA" & pws$yr16 == "GWA"]<- 1
pws$FLAG_SC_CHANGE[pws$yr13 == "GUA" & pws$yr14 == "SWCA"|pws$yr13 == "GUA" & pws$yr15 == "SWCA"|pws$yr13 == "GUA" & pws$yr16 == "SWCA"|
					pws$yr13 == "SWCA" & pws$yr14 == "GUA"|pws$yr13 == "SWCA" & pws$yr15 == "GUA"|pws$yr13 == "SWCA" & pws$yr16 == "GUA"]<- 1
pws$FLAG_SC_CHANGE[pws$yr13 == "SWA" & pws$yr14 == "SWCA"|pws$yr13 == "SWA" & pws$yr15 == "SWCA"|pws$yr13 == "SWA" & pws$yr16 == "SWCA"|
					pws$yr13 == "SWCA" & pws$yr14 == "SWA"|pws$yr13 == "SWCA" & pws$yr15 == "SWA"|pws$yr13 == "SWCA" & pws$yr16 == "SWA"]<- 1

#FLAG ROWS THAT CHANGE GWA TO SWA AND VICE VERSA				
pws$FLAG_SC_CHANGE[pws$yr13 == "GWA" & pws$yr14 == "SWA"|pws$yr13 == "GWA" & pws$yr15 == "SWA"|pws$yr13 == "GWA" & pws$yr16 == "SWA"|
					pws$yr13 == "SWA" & pws$yr14 == "GWA"|pws$yr13 == "SWA" & pws$yr15 == "GWA"|pws$yr13 == "SWA" & pws$yr16 == "GWA"]<- 1
					
#FLAG ROWS THAT CHANGE GWCA TO SWCA AND VICE VERSA				
pws$FLAG_SC_CHANGE[pws$yr13 == "GWCA" & pws$yr14 == "SWCA"|pws$yr13 == "GWCA" & pws$yr15 == "SWCA"|pws$yr13 == "GWCA" & pws$yr16 == "SWCA"|
					pws$yr13 == "SWCA" & pws$yr14 == "GWCA"|pws$yr13 == "SWCA" & pws$yr15 == "GWCA"|pws$yr13 == "SWCA" & pws$yr16 == "GWCA"]<- 2

#write a csv to manually QA that the source changes were flagged correctly
write.csv(pws, "SourceChanges_062717.csv", row.names=FALSE)

#get counts of unique PWSs with a source change, one list for certain changes and one list for those changing from GWCA to SWCA or vice versa
pws.ids.1 <- length(unique(as.character(lapply(strsplit(as.character(pws$PWSID_FAC[pws$FLAG_SC_CHANGE==1]), split="_"),tail, n=1))))
pws.ids.2 <- length(unique(as.character(lapply(strsplit(as.character(pws$PWSID_FAC[pws$FLAG_SC_CHANGE==2]), split="_"),tail, n=1))))

					
					
					
					