#########################################################################
#												                                                #
#				          DO WQS Geospatial Variable Calcs                      #
#												                                                #
#########################################################################

# Author: Alyssa Le, George Gardner, Jessica Balukas
# Date Created: 03/13/23
# Last Updated: 07/28/23

# The purpose of this code is to calculate the variables required for the DO WQS economic analysis.

rm(list=ls())
gc()

options(stringsAsFactors=FALSE)
pacman::p_load(dplyr, sf, tidycensus, tidyr, purrr, magrittr, ggplot2, nhdplusTools, measurements,
               foreign, tigris, openxlsx, terra, rgdal, readr, rgeos, sp, raster)

#Increase memory limit
memory.limit(size=100000)

#tigris::blocks(state = c("NY", "NJ", "PA", "MD", "DE", "DC", "CT"))
# Variables to change before running the code --------------------------------

outpath <- "C:/Users/40179/ICF/CWA Anniversary & Econ Support - General/06_Technical Support for Economic Analysis/TD3_Delaware River WQS/GIS/Data/R_programs/Outputs/"
# has information on the FMA, Delaware River main stem and zones, sturgeon critical habitats.
EPA_Data <- "C:/Users/40179/ICF/CWA Anniversary & Econ Support - General/06_Technical Support for Economic Analysis/TD3_Delaware River WQS/GIS/Data/"
# has information on NHD flowlines.
GIS_Data <- "C:/Users/40179/ICF/Steam Electric ELG - Documents/6-Benefits/GIS/NHD/"
# land cover information.
NLCD_Data <- "C:/Users/40179/OneDrive - ICF/Desktop/PHMSA Shapefiles/NLCD/nlcd_2019_land_cover_l48_20210604"

# Census Data Prep
# Activate API Key
census_api_key("0d3711e1f7f03d54ea326708428a15bcec0c0621") # Key registered to Sam Ennett
years <- c(2021)
Census_var <- c("B01001_001", "B19013_001E", "B25010_001E") # Population, Median Household Income, Average Household Size 

# STEP 1: Read in data ----------------------------------------------------------------

# Fish Maintenance Area
FMA <- st_read(paste0(EPA_Data,"Fish Maintenance Area/delriv_fma.shp")) |>
  st_transform("ESRI:102008")

# NLCD 2019 data (30-m resolution)
NLCD <- raster(paste0(NLCD_Data, "./nlcd_2019_land_cover_l48_20210604.img"))

# get 5-year CBG-level data on median income, population, and average household size.
CBG_df <- get_acs(geography = "block group",
             year = years,
             survey = "acs5",
             state = c("NY", "NJ", "PA", "MD", "DE", "DC", "CT"),
             variables = Census_var,
             geometry = T) 
CBG_df <- CBG_df |>
  st_transform("ESRI:102008")

# NOTE: don't need geometry of tracts since this data is only used to supplement missing CBG ACS data.
Tract_df <- get_acs(geography = "tract",
                    survey = "acs5",
                    state = c("NY", "NJ", "PA", "MD", "DE", "DC", "CT"),
                    variables = Census_var,
                    geometry = F) %>%
                    dplyr::select(-moe) %>%
  pivot_wider(names_from = "variable", values_from = "estimate")
# change names of columns.
names(Tract_df) <- c("GEOID", "TRACT_LOC", "POPULATION", "MED_INC", "HOUSEHOLD_SIZE")

# County polygons
County_df <- get_acs(geography = "county",
                  year = years,
                  survey = "acs5",
                  state = c("NY", "NJ", "PA", "MD", "DE", "DC", "CT"),
                  variables = Census_var,
                  geometry = T) 
County_df <- County_df |>
  st_transform("ESRI:102008")

# NHD flowlines for region 2
Reg2_flowlines <- st_read(paste0(GIS_Data,"NHDPlusMA/NHDPlus02/NHDSnapshot/Hydrography/NHDFlowline.shp")) |>
  st_transform("ESRI:102008") |>
  st_zm()

# NHD VAA data
#*# Note: this data provides the stream order information.
Reg2_VAA <- read.dbf(paste0(GIS_Data,"NHDPlusMA/NHDPlus02/NHDPlusAttributes/PlusFlowlineVAA.dbf"))

# Merge NHD flowline with NHD VAA data to get stream orders.
Reg2_data <- merge(Reg2_flowlines, Reg2_VAA, by.x = "COMID", by.y = "ComID")

# STEP 2: Prepare variables ----------------------------------------------------

# Affected resource:
# obtain the NHD flowlines that lie within the FMA.
FMA_flowlines <- st_intersection(FMA, Reg2_flowlines) |>
  filter(GNIS_NAME=="Delaware River") # filtering is done to remove tributaries
FMA_flowlines$LENGTH_FMA_m = st_length(FMA_flowlines) # may have to remove [m] from output since not considered numbers

# this gives us the stream order information for COMIDs within the FMA.
FMA_flowlines_vaa <- merge(FMA_flowlines, Reg2_VAA, by.x="COMID", by.y="ComID", all.x=T)

# Market Extent (100-mile buffer around FMA)
MarkExt <- st_union(st_buffer(FMA_flowlines_vaa, conv_unit(100, "mi", "m")))

# visual confirmation of edge boundary buffer.
ggplot() +
  geom_sf(data = MarkExt, fill = "red") +
  geom_sf(data = FMA, color = "blue")

# STEP 3: Find Population, Median Household Income, Average Household Size for CBG centroids that intersect --
# with the FMA-100-mile-buffer polygon -----------------------------------------------------------------------

# First, get centroids of the CBGs
CBG_df_centroid <- CBG_df |>
  st_centroid(CBG_df) |>
  st_zm()

# Next, find the centroid of CBGs that lie within 100-miles of the FMA
int_CBG <- st_filter(CBG_df_centroid, y = MarkExt, .predicate = st_intersects)

# visual confirmation of the CBGs and FMA buffer
ggplot() +
  geom_sf(data = MarkExt, fill = "red") +
  geom_sf(data = int_CBG, color = "blue") +
  geom_sf(data = FMA_flowlines, color = "green")

# obtain 2021 ACS 5-year estimates of population, average household size, median income.
# pivot wider
CBG_df_form <- int_CBG %>%
  filter(!is.na(estimate)) %>%
  st_drop_geometry() %>%
  dplyr::select(-moe) %>%
  pivot_wider(names_from = "variable", values_from = "estimate")
# change names of columns.
names(CBG_df_form) <- c("GEOID", "BG_LOC", "POPULATION", "MED_INC", "HOUSEHOLD_SIZE")

# Some NA values for median income and household size. Replace these with values from the census tract.
# If NA values exist after that, replace these with values from the county. 

# merge CBG data to tract data
CBG_df_form <- CBG_df_form %>%
  mutate(GEOID2 = substr(GEOID, 1, 11))
CBG_df_form <- merge(CBG_df_form, Tract_df, by.x = "GEOID2", by.y = "GEOID", all.x = TRUE)

temp_county <- County_df %>%
  st_drop_geometry() %>%
  dplyr::select(-moe) %>%
  pivot_wider(names_from = "variable", values_from = "estimate")
# change names of columns.
names(temp_county) <- c("GEOID", "COUNTY_LOC", "POPULATION", "MED_INC", "HOUSEHOLD_SIZE")

# merge CBG data to county data
CBG_df_form$GEOID3 = substr(CBG_df_form$GEOID, 1, 5)
CBG_df_form <- merge(CBG_df_form, temp_county, by.x = "GEOID3", by.y = "GEOID", all.x = TRUE)

# replace NA values for median income and household size by CBG first with tract data and second with county data.
CBG_df_form <- CBG_df_form %>%
  mutate(MED_INC.x = ifelse(is.na(MED_INC.x), MED_INC.y, MED_INC.x)) %>%
  mutate(MED_INC.x = ifelse(is.na(MED_INC.x), MED_INC, MED_INC.x)) %>%
  mutate(HOUSEHOLD_SIZE.x = ifelse(is.na(HOUSEHOLD_SIZE.x), HOUSEHOLD_SIZE.y, HOUSEHOLD_SIZE.x)) %>%
  mutate(HOUSEHOLD_SIZE.x = ifelse(is.na(HOUSEHOLD_SIZE.x), HOUSEHOLD_SIZE, HOUSEHOLD_SIZE.x))

CBG_df_form <- CBG_df_form[, 3:7]
names(CBG_df_form) <- c("GEOID", "BG_LOC", "POPULATION", "MED_INC", "HOUSEHOLD_SIZE")

rm(temp_county)

# export as excel file.
write.csv(CBG_df_form, paste0(outpath,"CBG_in_FMA_2021_5Yr_ACS.csv"))

# STEP 4: Generate a list of county polygons that intersect with the FMA -------------------------------------------------
int_Counties <- st_filter(County_df, y = FMA, .predicate = st_intersects) # NOTE: 7/28/23 revised to calculate based on FMA, not MarkExt
county_list <- int_Counties %>%
  filter(!is.na(estimate)) %>%
  st_drop_geometry() %>%
  dplyr::select(-moe) %>%
  pivot_wider(names_from = "variable", values_from = "estimate")
# change names of columns.
names(county_list) <- c("GEOID", "COUNTY_LOC", "POPULATION", "MED_INC", "HOUSEHOLD_SIZE")
# export as excel file.
write.csv(county_list, paste0(outpath,"Cnty_in_FMA_2021_5Yr_ACS.csv"))

# STEP 5: Calculate ln_ar_ratio -----------------------------------------------
# ln_ar_ratio = ln(Total area of CBGs within 100 miles of the FMA / area of counties intersecting the FMA)
# calculate area of CBGs within 100 miles of the FMA.

# create temporary list of intersecting CBGs to filter CBG_df. Need to do this since
# CBG shapes were converted from polygons to centroids (we want area of polygon).
temp_int_CBG <- int_CBG %>%
  dplyr::select(GEOID) %>%
  distinct(GEOID)

# calculates areas and removes duplicates
CBG_area <- CBG_df %>%
  mutate(area_m2 = st_area(CBG_df)) %>%
  merge(temp_int_CBG, all.y = TRUE) %>%
  st_drop_geometry() %>%
  pivot_wider(names_from = "variable", values_from = "estimate") %>%
  distinct(GEOID, .keep_all = TRUE)

rm(temp_int_CBG)

CBG_area <- sum(CBG_area$area_m2)
CBG_area_km2 <- CBG_area/1000^2 # total area of intersecting CBG in km^2.
attributes(CBG_area_km2) = NULL

# calculate area of counties intersecting the FMA.
County_area <- int_Counties %>%
  mutate(area_m2 = st_area(int_Counties)) %>%
  st_drop_geometry() %>%
  pivot_wider(names_from = "variable", values_from = "estimate") %>%
  distinct(GEOID, .keep_all = TRUE)

County_area <- sum(County_area$area_m2)
County_area_km2 <- County_area/1000^2 # total area of intersecting CBG in km^2.
attributes(County_area_km2) = NULL

ln_ar_ratio <- log(CBG_area_km2/County_area_km2) # log(52235.89/68412.99) = 2.427232

# Visualize affected resource and market extent
ggplot() +
  geom_sf(data = MarkExt, color = "black", fill = "lightgray") +
  geom_sf(data = int_Counties, color = "green") +
  geom_sf(data = FMA, color = "blue") +
  geom_sf(data = FMA_flowlines, color="green") +
  geom_sf(data = FMA_flowlines_vaa |> filter(StreamOrde>5), color="red") +
  coord_sf()

# STEP 6: Calculate sub_proportion -----------------------------------------------
# Draw 100-mile buffers around each CBG centroid identified within Step 1.
# For each CBG, estimate the:
# length miles of the FMA that fall within the buffer/total length miles stream order 5 and greater within the buffer

# Create a version of CBG shapefile that won't have repeat rows of the same geometry
CBG_df2 <- get_acs(geography = "block group",
                   year = years,
                   survey = "acs5",
                   state = c("NY", "NJ", "PA", "MD", "DE", "DC", "CT"),
                   variables = "B01001_001",
                   geometry = T) 
CBG_df2 <- CBG_df2 |>
  st_transform("ESRI:102008")
# Get centroids of the CBGs
CBG_df2 <- CBG_df2 |>
  st_centroid(CBG_df2) |>
  st_zm()
# Next, find the centroid of CBGs that lie within 100-miles of the FMA
CBG_df2 <- st_filter(CBG_df2, y = MarkExt, .predicate = st_intersects)
# Next, find 100-mile buffers of CBG centroids
CBG_buff <- st_buffer(CBG_df2, conv_unit(100, "mi", "m"))

# visual check.
ggplot() +
  geom_sf(data = CBG_buff, color = "black", fill = "lightgray") +
  geom_sf(data = int_CBG, color = "green")

## second, obtain sf for the flowlines that intersect both the FMA
## and the 100-mile CBG buffers. 

# create an empty dataframe to fill in.
sub_prop_table <- data.frame('GEOID' = CBG_buff$GEOID,
                             'FMA_length_m' = 0)

# the loop will add information on the lengths of the flowlines that intersect the
# FMA and the 100-mile CBG buffers.
i <- 1 # loop through CBGs
while (i <= nrow(CBG_buff)) {
  temp_cbg <- CBG_buff[i,]
  temp_crop <- st_intersection(FMA_flowlines_vaa, temp_cbg)
  temp_crop$length_FMA_m <- st_length(temp_crop)
  sub_prop_table$FMA_length_m[i] <- sum(temp_crop$length_FMA_m)
  i <- i + 1 # increment
}
rm(temp_cbg, temp_crop)

# next, a similar loop will be constructed to add information on the lengths of the
# flowlines above stream order 4 that intersect the 100-mile buffers.
# create another sf of the flowlines that are above stream order 4
Reg2_data_G4 <- Reg2_data |>
  filter(StreamOrde>4)

#visual check
ggplot() +
  geom_sf(data = Reg2_data, color = "blue") +
  geom_sf(data = Reg2_data_G4, color = "green")

# create another column in the sub_prop_table to fill in with flowline length data.
sub_prop_table <- sub_prop_table %>%
  mutate(SOG4_length_m = 0)

i <- 1 # loop through CBGs
while (i <= nrow(CBG_buff)) {
  temp_cbg <- CBG_buff[i,]
  temp_crop <- st_intersection(Reg2_data_G4, temp_cbg)
  temp_crop$length_SOG4_m <- st_length(temp_crop)
  sub_prop_table$SOG4_length_m[i] <- sum(temp_crop$length_SOG4_m)
  i <- i + 1 # increment
}

# lastly, create a new variable in sub_prop_table that calculates sub_prop
sub_prop_table$sub_prop <- sub_prop_table$FMA_length_m/sub_prop_table$SOG4_length_m

# export as excel file.
write.csv(sub_prop_table, paste0(outpath,"Sub_Prop_Table.csv"))

# STEP 7: Calculate ln_ar_agr -----------------------------------------------

# create a new clean combined county shapefile that is not a multi-part feature
# so can extract raster values.
int_Counties2 <- get_acs(geography = "county",
                     year = years,
                     survey = "acs5",
                     state = c("NY", "NJ", "PA", "MD", "DE", "DC", "CT"),
                     variables = "B01001_001",
                     geometry = T) 
int_Counties2 <- merge(int_Counties2, county_list)
int_Counties2 <- st_transform(int_Counties2, crs=st_crs(crs(NLCD)))
int_Counties2_NLCD <- extract(NLCD,int_Counties2, df = T)

ggplot() +
  geom_sf(data = int_Counties2)

# Calculate total land coverage.
LU_tot <- int_Counties2_NLCD %>%
  summarise(LU_tot = n())

# Calculate the agriculture land coverage.
LU_Ag <- int_Counties2_NLCD %>%
  filter(NLCD.Land.Cover.Class == 81 | NLCD.Land.Cover.Class == 82) %>%
  summarise(LU_Ag = n()) 

Landuse_Ag <- merge(LU_Ag,LU_tot,all=T)
Landuse_Ag[is.na(Landuse_Ag)] = 0

# Percent coverage
LU_Perc_Cov_Ag <- Landuse_Ag %>%
  mutate(LU_Percent_Ag = LU_Ag / LU_tot) # 0.178887 = 1287764/7198770

ln_ar_agr <- log(LU_Perc_Cov_Ag$LU_Percent_Ag) # -1.721003

# alternative code for calculating ln_ar_agr.
tot_cells<-sapply(int_Counties2_NLCD,length) # 7,198,770
farm_cells<-sapply(int_Counties2_NLCD,function(x) sum(x==81 | x==82)) # 1,287,764
ln_ar_agr2 <- log(1287764/7198770) # -1.721003
