########################################################################################
## Created by: George Gardner, ICF 
## Date Created: 10/18/22
## Date Edited: 10/27/22
## Check U.S. county settings output, household output, and SEDAC population projection
## merging. Then outputs the input files necessary for the 100-mile radius 
## county-level WTP runs.

########################################################################################
rm(list = ls(all = TRUE)) #first, clear R's workspace

library(readxl)
library(openxlsx)
library(dplyr)

setwd('C:/Users/55338/ICF/WOTUS Reconsideration (ICF Internal Site) - General/Wetland Meta-Analysis/Wetland MRM - R Code/')

# SEDAC path
SEDAC_path <- "C:/Users/55338/ICF/WOTUS Reconsideration (ICF Internal Site) - General/National Analysis/Population Projections/SEDAC/SEDAC_georeferenced_county_population_proj_excel files"

options(stringsAsFactors=FALSE)
pacman::p_load(dplyr, tidycensus, readxl, tidyverse, labelled, units)

## STEP 1: read in data.

# read in population data
pop_data <- read_xlsx("./data/WOTUS_100milebuffer_HouseholdWTP_6-15-22.xlsx")

# read in U.S. county settings data
# for 100-mile buffer
#county_data <- read_xlsx("./data/WOTUS_Buffer_WetlandAnalysis_10-12-22.xlsx")
county_data <- read_xlsx("./data/WOTUS_Buffer_WetlandAnalysis_10-27-22.xlsx")
county_data <- county_data[, c(1:3, 5:8, 10, 12, 14, 17, 20, 23, 26, 29, 33, 37, 41, 45)]

# read in SEDAC data
SEDAC_pop <- read_xlsx(paste0(SEDAC_path, "/hauer_county_totpop_SSPs.xlsx"))
names(SEDAC_pop)[names(SEDAC_pop)=="GEOID10"] <- "GEOID"

## STEP 2: convert HH income from 2019$ to 2021$ using BEA GDP deflator.
GDP <- read.csv("C:/Users/55338/ICF/WOTUS Reconsideration (ICF Internal Site) - General/Wetland Meta-Analysis/Meta-data/Metadata Data Extraction - March 2022/BEA_GDP_Deflator_Feb2022.csv")
conv <- GDP$Gross.domestic.product[32]/GDP$Gross.domestic.product[30]

county_data$`ln(Median Income)` <- log(exp(county_data$`ln(Median Income)`) * conv)

## STEP 3: check for, and potentially fill in, missing values. Remove counties 
## with no impacts or no local wetlands.

# 0 missing values when merging population data to county data
test <- left_join(county_data, pop_data) 
missing<-test[is.na(test$PersonsPerHH),]

# only missing GEOID 46102, Oglala Lakota County, South Dakota (row = 2,389).
test2 <- left_join(test, SEDAC_pop) 
missing2<-test2[is.na(test2$ssp12020),]

# fill in missing population projections using an average from surrounding counties.
# surrounding counties are GEOIDs: 31031, 46103, 31161, 46071, 46047, 46033, 31045, and 46007.
temp <- subset(test2, test2$GEOID == "31031" | test2$GEOID == "46103" | test2$GEOID == "31161" | test2$GEOID == "46071" |
                 test2$GEOID == "46047" | test2$GEOID == "46033" | test2$GEOID == "31045" | test2$GEOID == "46007")

temp <- temp[,c(41, 46, 51, 56, 61, 66)]

mean_20 <- mean(temp$ssp22020)
mean_25 <- mean(temp$ssp22025)
mean_30 <- mean(temp$ssp22030)
mean_35 <- mean(temp$ssp22035)
mean_40 <- mean(temp$ssp22040)
mean_45 <- mean(temp$ssp22045)

rm(temp)

test2[2389,41] <- mean_20
test2[2389,46] <- mean_25
test2[2389,51] <- mean_30
test2[2389,56] <- mean_35
test2[2389,61] <- mean_40
test2[2389,66] <- mean_45

# Need to remove counties where NWPR and Rapanos permanent impacts are identical. 
# From 3,067 to 3,062 obs.

test2 <- subset(test2, test2$impacts.NWPR.perm_100 !=test2$impacts.Rapanos.perm_100)

# remove GEOID 50011 since Prop.Forested_30 = NA.
test2 <- subset(test2, test2$GEOID != 50011)

# need to reassign proportion variables that are equal to 0 to an arbitrarily small number
# or proportion variables that are equal to 1 to an arbitrarily large number
# to run the benefit transfer code.
test2 <- test2 %>%
  mutate(Prop.Forested_100 = ifelse(Prop.Forested_100 == 0,0.0001,Prop.Forested_100)) %>%
  mutate(Prop.Forested_100 = ifelse(Prop.Forested_100 == 1,0.9999,Prop.Forested_100)) %>%
  mutate(Prop.Forested_30 = ifelse(Prop.Forested_30 == 0,0.0001,Prop.Forested_30)) %>%
  mutate(Prop.Forested_30 = ifelse(Prop.Forested_30 == 1,0.9999,Prop.Forested_30)) %>%
  mutate(Prop.local_100 = ifelse(Prop.local_100 == 0, 0.0001,Prop.local_100)) %>%
  mutate(Prop.local_100 = ifelse(Prop.local_100 == 1, 0.9999,Prop.local_100)) %>%
  mutate(NWPR.perm.Prop.local_100 = ifelse(NWPR.perm.Prop.local_100 == 0, 0.0001, NWPR.perm.Prop.local_100)) %>%
  mutate(NWPR.perm.Prop.local_100 = ifelse(NWPR.perm.Prop.local_100 == 1, 0.9999, NWPR.perm.Prop.local_100)) %>%
  mutate(Rapanos.perm.Prop.local_100 = ifelse(Rapanos.perm.Prop.local_100 == 0, 0.0001, Rapanos.perm.Prop.local_100)) %>%
  mutate(Rapanos.perm.Prop.local_100 = ifelse(Rapanos.perm.Prop.local_100 == 1, 0.9999, Rapanos.perm.Prop.local_100))

## STEP 4: output U.S. county data and SEDAC projections

# output the U.S. county data once more.
county_data2 <- test2[, c(4:19,1,2)]

#save(county_data2, file = "./data/county_data_100m_10-18-22.rda")
#openxlsx::write.xlsx(county_data2, file = './data/county_data_100m_10-18-22.xlsx')
save(county_data2, file = "./data/county_data_100m_10-27-22.rda")
openxlsx::write.xlsx(county_data2, file = './data/county_data_100m_10-27-22.xlsx')

# output SEDAC projections
# Create the matrix for HH data projections using SEDAC SSP2.

HHpop <- matrix(0,nrow(test2),ncol=26) # later remove first and last 3 columns. only added for extrap.
colnames(HHpop) <- c("2020", "2021","2022","2023", "2024","2025","2026","2027","2028", "2029","2030","2031","2032","2033","2034","2035","2036",
                     "2037", "2038","2039","2040","2041","2042","2043","2044","2045")

HHpop[,1] <- test2$ssp22020 # extract SSP2 in year 2020
HHpop[,6] <- test2$ssp22025 # extract SSP2 in year 2025
HHpop[,11] <- test2$ssp22030 # extract SSP2 in year 2030
HHpop[,16] <- test2$ssp22035 # extract SSP2 in year 2035
HHpop[,21] <- test2$ssp22040 # extract SSP2 in year 2040
HHpop[,26] <- test2$ssp22045 # extract SSP2 in year 2045

# need to fill in blanks using linear extrapolation. Take the difference between the
# years and divide by 5 (since estimates are every 5 years). Then add this for each year.

diff_20_25 <- (test2$ssp22025 - test2$ssp22020)/5
diff_25_30 <- (test2$ssp22030 - test2$ssp22025)/5
diff_30_35 <- (test2$ssp22035 - test2$ssp22030)/5
diff_35_40 <- (test2$ssp22040 - test2$ssp22035)/5
diff_40_45 <- (test2$ssp22045 - test2$ssp22040)/5

# start to fill in matrix.
for (i in 1:4) {
  HHpop[,i+1] <- test2$ssp22020 + diff_20_25*i
}

for (i in 1:4) {
  HHpop[,i+6] <- test2$ssp22025 + diff_25_30*i
}

for (i in 1:4) {
  HHpop[,i+11] <- test2$ssp22030 + diff_30_35*i
}

for (i in 1:4) {
  HHpop[,i+16] <- test2$ssp22035 + diff_35_40*i
}

for (i in 1:4) {
  HHpop[,i+21] <- test2$ssp22040 + diff_40_45*i
}

HHpop <- HHpop[, c(4:23)]

# now divide by the number of persons per household to get HH projections
HHpop <- HHpop/test2$PersonsPerHH
# lastly, for counties which were split into two parts, split the HH projections
# in half.
temp <- test2[, c(1,2)]
temp$TAG <- 0
temp$TAG <- ifelse(temp$GEOID == "04001" | temp$GEOID == "04005" | temp$GEOID == "04013" | temp$GEOID == "04015" |
                     temp$GEOID == "04017" | temp$GEOID == "06027" | temp$GEOID == "06037" | temp$GEOID == "06071" |
                     temp$GEOID == "32007" | temp$GEOID == "32023" | temp$GEOID == "32031", 1, 0)

HHpop <- cbind(HHpop, temp)

HHpop$`2023` <- ifelse(HHpop$TAG == 1, HHpop$`2023`/2,HHpop$`2023`)
HHpop$`2024` <- ifelse(HHpop$TAG == 1, HHpop$`2024`/2,HHpop$`2024`)
HHpop$`2025` <- ifelse(HHpop$TAG == 1, HHpop$`2025`/2,HHpop$`2025`)
HHpop$`2026` <- ifelse(HHpop$TAG == 1, HHpop$`2026`/2,HHpop$`2026`)
HHpop$`2027` <- ifelse(HHpop$TAG == 1, HHpop$`2027`/2,HHpop$`2027`)
HHpop$`2028` <- ifelse(HHpop$TAG == 1, HHpop$`2028`/2,HHpop$`2028`)
HHpop$`2029` <- ifelse(HHpop$TAG == 1, HHpop$`2029`/2,HHpop$`2029`)
HHpop$`2030` <- ifelse(HHpop$TAG == 1, HHpop$`2030`/2,HHpop$`2030`)
HHpop$`2031` <- ifelse(HHpop$TAG == 1, HHpop$`2031`/2,HHpop$`2031`)
HHpop$`2032` <- ifelse(HHpop$TAG == 1, HHpop$`2032`/2,HHpop$`2032`)
HHpop$`2033` <- ifelse(HHpop$TAG == 1, HHpop$`2033`/2,HHpop$`2033`)
HHpop$`2034` <- ifelse(HHpop$TAG == 1, HHpop$`2034`/2,HHpop$`2034`)
HHpop$`2035` <- ifelse(HHpop$TAG == 1, HHpop$`2035`/2,HHpop$`2035`)
HHpop$`2036` <- ifelse(HHpop$TAG == 1, HHpop$`2036`/2,HHpop$`2036`)
HHpop$`2037` <- ifelse(HHpop$TAG == 1, HHpop$`2037`/2,HHpop$`2037`)
HHpop$`2038` <- ifelse(HHpop$TAG == 1, HHpop$`2038`/2,HHpop$`2038`)
HHpop$`2039` <- ifelse(HHpop$TAG == 1, HHpop$`2039`/2,HHpop$`2039`)
HHpop$`2040` <- ifelse(HHpop$TAG == 1, HHpop$`2040`/2,HHpop$`2040`)
HHpop$`2041` <- ifelse(HHpop$TAG == 1, HHpop$`2041`/2,HHpop$`2041`)
HHpop$`2042` <- ifelse(HHpop$TAG == 1, HHpop$`2042`/2,HHpop$`2042`)

rm(temp)
# to include for table building in reports
county_names <- test2[,c(1, 20, 27)]

names(county_names)[names(county_names)=="NAMELSAD10"] <- "County"

#openxlsx::write.xlsx(county_names, file = './data/county_names_100m_10-18-22.xlsx')
#save(county_names, file = "./data/county_names_100m_10-18-22.rda")
openxlsx::write.xlsx(county_names, file = './data/county_names_100m_10-27-22.xlsx')
save(county_names, file = "./data/county_names_100m_10-27-22.rda")

# HH projections
HHpop <- HHpop[,c(1:20)]
#openxlsx::write.xlsx(HHpop, file = './data/county_HHprojections_100m_10-18-22.xlsx')
#save(HHpop, file = "./data/county_HHprojections_100m_10-18-22.rda")
openxlsx::write.xlsx(HHpop, file = './data/county_HHprojections_100m_10-27-22.xlsx')
save(HHpop, file = "./data/county_HHprojections_100m_10-27-22.rda")
