

## Define input and output file paths - REVISE to reflect user's working directory
in_path <- "C:/Users/51235/ICF/WOTUS Reconsideration (ICF Internal Site) - Distributional Impacts/Inputs/"
out_path <- "C:/Users/51235/ICF/WOTUS Reconsideration (ICF Internal Site) - Distributional Impacts/Outputs/"
date <- "10082021"


# Input 404 project data, filtered to fields of interest
filtered404_input <- read.csv(paste0(in_path,"404data_filtered_input.csv"))
#create new column to pull the first worktype listed:
# install.packages(stringi)
library(stringi)
# install.packages("stringr")
library(stringr)
library(dplyr)
library(data.table)
library(reshape2)

## Creat unique ID ############
filtered404_input$PERMIT_ID <- paste0(filtered404_input$ï..ACTION_FOLDER_ID,"_",filtered404_input$ACTION_ID)

########## Clean worktype ########################

#The back slash (\) is not showing up as a character in the string. There are also some other odd special characters in the worktype field. Code below creates a new clean field, devoid of special characters, with words separated by blank space:

filtered404_input$WORKTYPE_clean <- gsub("[[:punct:]]", " ", filtered404_input$WORKTYPE)

#create new columns representing the primary and secondary worktype:
filtered404_input$character1 <- sapply(strsplit(as.character(filtered404_input$WORKTYPE), "\\\\"), `[`, 2)
filtered404_input$character2 <- sapply(strsplit(as.character(filtered404_input$WORKTYPE), "\\\\"), `[`, 3)
filtered404_input$character2 <- gsub(",","",filtered404_input$character2)
#remove blank spaces:
trim <- function (x) gsub("^\\s+|\\s+$", "", x)

filtered404_input$character1 <- str_trim(filtered404_input$character1)
filtered404_input$character2 <- str_trim(filtered404_input$character2)


filtered404_input$WORKTYPE_clean <- str_trim(filtered404_input$WORKTYPE_clean) #removes leading and trailing whitespace

#convert to all lowercase
filtered404_input$WORKTYPE_clean <- tolower(filtered404_input$WORKTYPE_clean)
filtered404_input$PROJECT_NAME_clean <- tolower(filtered404_input$PROJECT_NAME)
filtered404_input$PROJECT_DESCRIPTION_clean <- tolower(filtered404_input$PROJECT_DESCRIPTION)


#### Create working df####
filtered404 <- filtered404_input

#### Merge with final ORM data processing output
final_ORM_output <- read.csv(paste0(orm_qa_path,"ORM_Dataset_Final_MirandaSubset.csv"))
final_ORM_output <- final_ORM_output[,c(1,3)]
#filtered404 <- merge(final_ORM_output,filtered404,all.x=TRUE)
final_permit_id <- unique(final_ORM_output$PERMIT_ID)
filtered404 <- filtered404[filtered404$PERMIT_ID %in% final_permit_id,]
filtered404 <- merge(filtered404,final_ORM_output)

######## Identify public projects ##############
filtered404$w_id_1 <- ifelse(grepl("city of|town of|county of|state|parish|township|municipality|municipal|department of|dept. of| dot |dot|usdot|usda|dept of|dpw|village of|village|caltrans|nysdot|division of|dnr|odnr|md sha|kdfwr|agfc|ag&fc|ahtd|nceep|achd|adcnr",filtered404$PROJECT_NAME_clean),"Public",NA)

##Indian Tribe or State 404 Program
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & grepl("indian tribe or state 404 program", filtered404$WORKTYPE_clean), "TSP", filtered404$w_id_1)


############### DEVELOPMENT ####################
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & filtered404$character1 == "DEVELOPMENT", "D", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & filtered404$character1 == "TRANSPORTATION" & grepl("development",filtered404$WORKTYPE_clean), "D", filtered404$w_id_1)
filtered404$D_C <- ifelse(filtered404$w_id_1 == "D" & grepl("commercial",filtered404$WORKTYPE_clean),"C","") #commercial
filtered404$D_I <- ifelse(filtered404$w_id_1 == "D" & grepl("industrial",filtered404$WORKTYPE_clean),"I","") #industrial
filtered404$D_R <- ifelse(filtered404$w_id_1 == "D" & grepl("residential",filtered404$WORKTYPE_clean),"R","") #residential
filtered404$D_AI <- ifelse(filtered404$w_id_1 == "D" & grepl("associated infrastructure",filtered404$WORKTYPE_clean),"AI","") #associated infrastructure
filtered404$D_Rec <- ifelse(filtered404$w_id_1 == "D" & grepl("recreational",filtered404$WORKTYPE_clean),"Rec","") #recreational
filtered404$D_M <- ifelse(filtered404$w_id_1 == "D" & grepl("multi family|multi  family",filtered404$WORKTYPE_clean),"M","") #multi-family
filtered404$D_S <- ifelse(filtered404$w_id_1 == "D" & (grepl("single family|single  family",filtered404$WORKTYPE_clean)|grepl("single-family",filtered404$WORKTYPE_clean)),"S","") #single family
filtered404$C_I_R_count <- ifelse(filtered404$D_C == "C",1,0)+ifelse(filtered404$D_I == "I",1,0)+ifelse(filtered404$D_R == "R",1,0)
filtered404$res_use <- ifelse(filtered404$D_M == "M" & filtered404$D_S == "S","X",
                              ifelse(filtered404$D_M == "" & filtered404$D_S == "", "UK",paste0(filtered404$D_M,filtered404$D_S)))
filtered404$D_final <- ifelse(filtered404$w_id_1 == "D" & filtered404$D_C == "" & filtered404$D_I == "" & filtered404$D_R == "" & filtered404$D_AI == "AI" & 
                                filtered404$D_Rec == "","AI",
                              ifelse(filtered404$w_id_1 == "D" & filtered404$D_R == "" & filtered404$D_Rec == "Rec","C",
                              ifelse(filtered404$w_id_1 == "D" & filtered404$D_R == "R" & filtered404$D_Rec == "Rec", paste0("R-",filtered404$res_use),
                              ifelse(filtered404$w_id_1 == "D" & filtered404$C_I_R_count > 1, "X",
                              ifelse(filtered404$w_id_1 == "D" & filtered404$D_C == "C","C",
                              ifelse(filtered404$w_id_1 == "D" & filtered404$D_I == "I","I",
                              ifelse(filtered404$w_id_1 == "D" & filtered404$D_R == "R",paste0("R-",filtered404$res_use),
                              ifelse(filtered404$w_id_1 == "D" & filtered404$D_Rec == "Rec","Rec",
                              ifelse(filtered404$w_id_1 == "D", "UK",NA)))))))))
filtered404$w_id_1 <- ifelse(filtered404$w_id_1 == "D" & !is.na(filtered404$D_final),paste0(filtered404$w_id_1,"-",filtered404$D_final),filtered404$w_id_1)


#### When Industrial or Commercial or AI development is listed with Agriculture, Logging, Mining and Drilling,
#### Energy Production (including generation, pipelines, transmission/distribution), Airport, Marina, or Landfill
#### then "D-I" or "D-C" or "D-AI" are considered secondary

## AGRICULTURE
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & grepl("agriculture",filtered404$WORKTYPE_clean), "Ag", filtered404$w_id_1)

## LOGGING
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & grepl("logging",filtered404$WORKTYPE_clean), "Logging", filtered404$w_id_1)

## MINING AND DRILLING
filtered404$mining <- str_locate(filtered404$WORKTYPE_clean,"mining and drilling   mining")[,1]
filtered404$mining <- ifelse(is.na(filtered404$mining),0,filtered404$mining)
filtered404$drilling <- str_locate(filtered404$WORKTYPE_clean,"mining and drilling   drilling")[,1]
filtered404$drilling <- ifelse(is.na(filtered404$drilling),0,filtered404$drilling)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & grepl("mining and drilling   mining",filtered404$WORKTYPE_clean) & grepl("mining and drilling   drilling",filtered404$WORKTYPE_clean), 
                             ifelse(filtered404$mining < filtered404$drilling,"MD-M", "MD-D"),filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & grepl("mining and drilling   mining",filtered404$WORKTYPE_clean), "MD-M", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & grepl("mining and drilling   drilling",filtered404$WORKTYPE_clean), "MD-D", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & grepl("mining and drilling",filtered404$WORKTYPE_clean), "MD-UK", filtered404$w_id_1)


## ENERGY PRODUCTION (generation, pipeline, transmission/distribution)
#filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I") & (grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$WORKTYPE_clean)|grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$PROJECT_NAME_clean)|grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$PROJECT_DESCRIPTION_clean)), "EG", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & (grepl("energy generation",filtered404$WORKTYPE_clean)), "EG", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & (grepl("pipeline",filtered404$WORKTYPE_clean)|grepl("pipeline",filtered404$PROJECT_NAME_clean)|grepl("pipeline",filtered404$PROJECT_DESCRIPTION_clean)), "PL", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & (grepl("transmission",filtered404$WORKTYPE_clean)|grepl("transmission",filtered404$PROJECT_NAME_clean)|grepl("transmission",filtered404$PROJECT_DESCRIPTION_clean)), "Trans", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & grepl("distribution",filtered404$WORKTYPE_clean), "Dist", filtered404$w_id_1)

# Airport
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & (grepl("airport",filtered404$WORKTYPE_clean)|grepl("air|airport",filtered404$PROJECT_NAME_clean)|grepl("air|airport",filtered404$PROJECT_DESCRIPTION_clean)), "Airport", filtered404$w_id_1)

# Marina
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & (grepl("yacht club|marina",filtered404$WORKTYPE_clean)|grepl("yacht club|marina",filtered404$PROJECT_NAME_clean)|grepl("yacht club|marina",filtered404$PROJECT_DESCRIPTION_clean)), "Marina", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & ((grepl("boat|dock",filtered404$WORKTYPE_clean)&grepl("non-residential",filtered404$WORKTYPE_clean))|(grepl("boat|dock",filtered404$PROJECT_NAME_clean)&grepl("non-residential",filtered404$PROJECT_NAME_clean))|(grepl("boat|dock",filtered404$PROJECT_DESCRIPTION_clean)&grepl("non-residential",filtered404$PROJECT_DESCRIPTION_clean))),"Marina",filtered404$w_id_1)
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & ((grepl("boat|dock",filtered404$WORKTYPE_clean)&!grepl("residential",filtered404$WORKTYPE_clean))|(grepl("boat|dock",filtered404$PROJECT_NAME_clean)&!grepl("residential",filtered404$PROJECT_NAME_clean))|(grepl("boat|dock",filtered404$PROJECT_DESCRIPTION_clean)&!grepl("residential",filtered404$PROJECT_DESCRIPTION_clean))),"Marina",filtered404$w_id_1)

# Landfill
filtered404$w_id_1 <- ifelse((filtered404$w_id_1 == "D-C" | filtered404$w_id_1 == "D-I" | filtered404$w_id_1 == "D-AI") & (grepl("landfill",filtered404$WORKTYPE_clean)|grepl("landfill",filtered404$PROJECT_NAME_clean)|grepl("landfill",filtered404$PROJECT_DESCRIPTION_clean)),"Landfill",filtered404$w_id_1)

##### Remaining categories assigned based on 1st or 2nd work type listed, or project name/description #######################

## AGRICULTURE
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & filtered404$character1 == "AGRICULTURE", "Ag", filtered404$w_id_1)

## AQUACULTURE
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & filtered404$character1 == "AQUACULTURE",
                             ifelse(grepl("plant", filtered404$WORKTYPE_clean), "Aquaculture-Plants", 
                             ifelse(grepl("finfish", filtered404$WORKTYPE_clean), "Aquaculture-Finfish", 
                             ifelse(grepl("shellfish", filtered404$WORKTYPE_clean), "Aquaculture-Shellfish", "Aquaculture-X"))),filtered404$w_id_1)

## Energy generation
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & filtered404$character1 == "ENERGY GENERATION", "EG", filtered404$w_id_1)
#filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & (grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$WORKTYPE_clean)|grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$PROJECT_NAME_clean)|grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$PROJECT_DESCRIPTION_clean)), "EG", filtered404$w_id_1)

## Mining and Drilling
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & grepl("mining and drilling   mining",filtered404$WORKTYPE_clean) & grepl("mining and drilling   drilling",filtered404$WORKTYPE_clean), 
                             ifelse(filtered404$mining < filtered404$drilling,"MD-M", "MD-D"),filtered404$w_id_1)
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & grepl("mining and drilling   mining",filtered404$WORKTYPE_clean), "MD-M", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & grepl("mining and drilling   drilling",filtered404$WORKTYPE_clean), "MD-D", filtered404$w_id_1)
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & grepl("mining and drilling",filtered404$WORKTYPE_clean), "MD-UK", filtered404$w_id_1)

## Transmission
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & (grepl("transmission",filtered404$WORKTYPE_clean) | grepl("transmission",filtered404$PROJECT_NAME_clean)),"Trans",filtered404$w_id_1)

## Rail
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & ((filtered404$character1 == "TRANSPORTATION" & filtered404$character2 == "RAIL") | grepl("rail|railway| rr|rr ",filtered404$PROJECT_NAME_clean)), "Rail", filtered404$w_id_1)

## Cleanup of hazardous or toxic waste
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & grepl("cleanup hazardous or toxic wastes",filtered404$WORKTYPE_clean), "CHTW", filtered404$w_id_1)

## Pipeline
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & ((filtered404$character1 == "TRANSPORTATION" & filtered404$character2 == "PIPELINE")|grepl("pipeline",filtered404$WORKTYPE_clean)|grepl("pipeline",filtered404$PROJECT_NAME_clean)|grepl("pipeline",filtered404$PROJECT_DESCRIPTION_clean)), "PL", filtered404$w_id_1)

## Utility
#filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & (filtered404$character2 == "UTILITY" | filtered404$character2 == "UTILITY LINE OR STRUCTURE"| filtered404$character2 == "UTILITY LINE OR STRUCTURE,"), "UL", filtered404$w_id_1)
filtered404$utility <- ifelse(grepl("utility",filtered404$WORKTYPE_clean), "UL", NA)
filtered404$w_id_1 <- ifelse(!is.na(filtered404$utility) & filtered404$w_id_1 == "EG","UL-EG",
                             ifelse(!is.na(filtered404$utility) & filtered404$w_id_1 == "Trans","UL-TD",
                             ifelse(!is.na(filtered404$utility) & filtered404$w_id_1 == "PL","UL-PL",
                             ifelse(is.na(filtered404$w_id_1) & !is.na(filtered404$utility) & (grepl("water|wwtp|interceptor",filtered404$WORKTYPE_clean)|grepl("water|wwtp|interceptor",filtered404$PROJECT_DESCRIPTION_clean)|grepl("water|wwtp|interceptor",filtered404$PROJECT_NAME_clean)),"UL-W",
                             ifelse(is.na(filtered404$w_id_1) & !is.na(filtered404$utility) & (grepl("sewage|sewer",filtered404$WORKTYPE_clean)|grepl("sewage|sewer",filtered404$PROJECT_DESCRIPTION_clean)|grepl("sewage|sewer",filtered404$PROJECT_NAME_clean)),"UL-S",
                             ifelse(is.na(filtered404$w_id_1) & !is.na(filtered404$utility) & (grepl("gas|pipeline",filtered404$WORKTYPE_clean)|grepl("gas|pipeline",filtered404$PROJECT_DESCRIPTION_clean)|grepl("gas|pipeline",filtered404$PROJECT_NAME_clean)),"UL-NG",
                             ifelse(is.na(filtered404$w_id_1) & !is.na(filtered404$utility) & (grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$WORKTYPE_clean)|grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$PROJECT_NAME_clean)|grepl("energy generation|power|electric|generation|substation|energy|kv|entergy|national grid",filtered404$PROJECT_DESCRIPTION_clean)),"UL-EG",
                             ifelse(is.na(filtered404$w_id_1) & !is.na(filtered404$utility),"UL-Unknown",filtered404$w_id_1))))))))

## Yacht club/marina
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & (grepl("yacht club|marina",filtered404$WORKTYPE_clean)|grepl("yacht club|marina",filtered404$PROJECT_NAME_clean)|grepl("yacht club|marina",filtered404$PROJECT_DESCRIPTION_clean)),"Marina",filtered404$w_id_1)
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & ((grepl("boat|dock",filtered404$WORKTYPE_clean)&grepl("non-residential",filtered404$WORKTYPE_clean))|(grepl("boat|dock",filtered404$PROJECT_NAME_clean)&grepl("non-residential",filtered404$PROJECT_NAME_clean))|(grepl("boat|dock",filtered404$PROJECT_DESCRIPTION_clean)&grepl("non-residential",filtered404$PROJECT_DESCRIPTION_clean))),"Marina",filtered404$w_id_1)
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & ((grepl("boat|dock",filtered404$WORKTYPE_clean)&!grepl("residential",filtered404$WORKTYPE_clean))|(grepl("boat|dock",filtered404$PROJECT_NAME_clean)&!grepl("residential",filtered404$PROJECT_NAME_clean))|(grepl("boat|dock",filtered404$PROJECT_DESCRIPTION_clean)&!grepl("residential",filtered404$PROJECT_DESCRIPTION_clean))),"Marina",filtered404$w_id_1)

## Airport
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & (grepl("airport|air",filtered404$WORKTYPE_clean)|grepl("airport|air",filtered404$PROJECT_NAME_clean)|grepl("airport|air",filtered404$PROJECT_DESCRIPTION_clean)),"Airport",filtered404$w_id_1)

## Landfill
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & (grepl("landfill",filtered404$WORKTYPE_clean)|grepl("landfill",filtered404$PROJECT_NAME_clean)|grepl("landfill",filtered404$PROJECT_DESCRIPTION_clean)),"Landfill",filtered404$w_id_1)

## Dredging
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & (filtered404$character1 == "DREDGING" | grepl("bank stabilization|channelization", filtered404$WORKTYPE_clean)), "Dr", filtered404$w_id_1)

## Highway, Street, Bridge
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1) & (grepl("highway|hwy|road|rd|roads|street|st|bridge|culvert",filtered404$WORKTYPE_clean)|grepl("highway|hwy|road|rd|roads|street|st|bridge|culvert",filtered404$PROJECT_NAME_clean)|grepl("highway|hwy|road|rd|roads|street|st|bridge|culvert",filtered404$PROJECT_DESCRIPTION_clean)),"HSB",filtered404$w_id_1)

##### Assign energy source to relevant categories (generation, transmission, pipeline, distribution, mining & drilling, utility)

# Mining & drilling
filtered404$energy_source <- ifelse((filtered404$w_id_1 == "MD-M" | filtered404$w_id_1 == "MD-D"),
                                    ifelse(grepl("shale gas",filtered404$WORKTYPE_clean) | grepl("shale gas",filtered404$PROJECT_DESCRIPTION_clean) | grepl("shale gas",filtered404$PROJECT_NAME_clean),"-SHALE GAS",
                                    ifelse(grepl("oil",filtered404$WORKTYPE_clean) | grepl("oil",filtered404$PROJECT_DESCRIPTION_clean) | grepl("oil",filtered404$PROJECT_NAME_clean),"-OIL",
                                    ifelse(grepl("gas",filtered404$WORKTYPE_clean) | grepl("gas",filtered404$PROJECT_DESCRIPTION_clean) | grepl("gas",filtered404$PROJECT_NAME_clean),"-GAS",
                                    ifelse(grepl("coal",filtered404$WORKTYPE_clean) | grepl("coal",filtered404$PROJECT_DESCRIPTION_clean) | grepl("coal",filtered404$PROJECT_NAME_clean),"-COAL",
                                    ifelse(grepl("gravel",filtered404$WORKTYPE_clean) | grepl("gravel",filtered404$PROJECT_DESCRIPTION_clean) | grepl("gravel",filtered404$PROJECT_NAME_clean),"-GRAVEL",
                                    ifelse(grepl("peat",filtered404$WORKTYPE_clean) | grepl("peat",filtered404$PROJECT_DESCRIPTION_clean) | grepl("peat",filtered404$PROJECT_NAME_clean),"-PEAT",
                                    ifelse(grepl("phosphate",filtered404$WORKTYPE_clean) | grepl("phosphate",filtered404$PROJECT_DESCRIPTION_clean) | grepl("phosphate",filtered404$PROJECT_NAME_clean),"-PHOSPHATE",
                                    ifelse(grepl("rock",filtered404$WORKTYPE_clean) | grepl("rock",filtered404$PROJECT_DESCRIPTION_clean) | grepl("rock",filtered404$PROJECT_NAME_clean),"-ROCK",
                                    ifelse(grepl("sand",filtered404$WORKTYPE_clean) | grepl("sand",filtered404$PROJECT_DESCRIPTION_clean) | grepl("sand",filtered404$PROJECT_NAME_clean),"-SAND",
                                    ifelse(grepl("other mineral",filtered404$WORKTYPE_clean) | grepl("other mineral",filtered404$PROJECT_DESCRIPTION_clean) | grepl("other mineral",filtered404$PROJECT_NAME_clean),"-OTHER MINERAL","-UK")))))))))),NA)

# Energy generation
filtered404$energy_source <- ifelse(!is.na(filtered404$energy_source),filtered404$energy_source,
                                    ifelse(filtered404$w_id_1 == "EG" | filtered404$w_id_1 == "UL-EG",
                                    ifelse(grepl("coal",filtered404$WORKTYPE_clean) | grepl("coal",filtered404$PROJECT_DESCRIPTION_clean) | grepl("coal",filtered404$PROJECT_NAME_clean),"-COAL",
                                    ifelse(grepl("cogen",filtered404$WORKTYPE_clean) | grepl("cogen",filtered404$PROJECT_DESCRIPTION_clean) | grepl("cogen",filtered404$PROJECT_NAME_clean),"-COGEN",
                                    ifelse(grepl("geothermal",filtered404$WORKTYPE_clean) | grepl("geothermal",filtered404$PROJECT_DESCRIPTION_clean) | grepl("geothermal",filtered404$PROJECT_NAME_clean),"-GEOTHERMAL",
                                    ifelse(grepl("hydropower",filtered404$WORKTYPE_clean) | grepl("hydropower",filtered404$PROJECT_DESCRIPTION_clean) | grepl("hydropower",filtered404$PROJECT_NAME_clean),"-HYDROPOWER",
                                    ifelse(grepl("natural gas",filtered404$WORKTYPE_clean) | grepl("natural gas",filtered404$PROJECT_DESCRIPTION_clean) | grepl("natural gas",filtered404$PROJECT_NAME_clean),"-NATURAL GAS",
                                    ifelse(grepl("nuclear",filtered404$WORKTYPE_clean) | grepl("nuclear",filtered404$PROJECT_DESCRIPTION_clean) | grepl("nuclear",filtered404$PROJECT_NAME_clean),"-NUCLEAR",
                                    ifelse(grepl("oil",filtered404$WORKTYPE_clean) | grepl("oil",filtered404$PROJECT_DESCRIPTION_clean) | grepl("oil",filtered404$PROJECT_NAME_clean),"-OIL",
                                    ifelse(grepl("solar",filtered404$WORKTYPE_clean) | grepl("solar",filtered404$PROJECT_DESCRIPTION_clean) | grepl("solar",filtered404$PROJECT_NAME_clean),"-SOLAR",
                                    ifelse(grepl("wind",filtered404$WORKTYPE_clean) | grepl("wind",filtered404$PROJECT_DESCRIPTION_clean) | grepl("wind",filtered404$PROJECT_NAME_clean),"-WIND","-UK"))))))))),NA))

# Pipeline
filtered404$energy_source <- ifelse(!is.na(filtered404$energy_source),filtered404$energy_source,
                                    ifelse(filtered404$w_id_1 == "PL" | filtered404$w_id_1 == "UL-PL",
                                    ifelse((grepl("gas",filtered404$WORKTYPE_clean) | grepl("gas",filtered404$PROJECT_DESCRIPTION_clean) | grepl("gas",filtered404$PROJECT_NAME_clean)) & 
                                             (grepl("oil",filtered404$WORKTYPE_clean) | grepl("oil",filtered404$PROJECT_DESCRIPTION_clean) | grepl("oil",filtered404$PROJECT_NAME_clean)),"-X",
                                    ifelse(grepl("gas",filtered404$WORKTYPE_clean) | grepl("gas",filtered404$PROJECT_DESCRIPTION_clean) | grepl("gas",filtered404$PROJECT_NAME_clean),"-NG",
                                    ifelse(grepl("oil",filtered404$WORKTYPE_clean) | grepl("oil",filtered404$PROJECT_DESCRIPTION_clean) | grepl("oil",filtered404$PROJECT_NAME_clean),"-OIL","-UK"))),NA))

# Append energy source to w_id_1
filtered404$w_id_1 <- ifelse(!is.na(filtered404$energy_source),paste0(filtered404$w_id_1,filtered404$energy_source),filtered404$w_id_1)

# Change NA to "Unknown"
filtered404$w_id_1 <- ifelse(is.na(filtered404$w_id_1),"Unknown",filtered404$w_id_1)

###Merge with NAICS#######################################
w_id_naics_broad <- read.csv(paste0(in_path,"w_id_broad_naics_xwalk.csv"))
names(w_id_naics_broad) <- c("w_id_1","NAICS","NAICS Industry Description")
filtered404_naics <- merge(filtered404,w_id_naics_broad)



#### SUMMARIZE RESULTS ##################################
proj_by_sector <- data.frame(table(filtered404$w_id_1))

# Permits by sector
proj_by_naics <- aggregate(filtered404_naics$PERMIT_ID,by = list(filtered404_naics$NAICS,filtered404_naics$`NAICS Industry Description`),FUN=length)
names(proj_by_naics) <- c("NAICS","NAICS Industry Description","Number of permits")
proj_by_naics <- arrange(proj_by_naics,NAICS)
write.csv(proj_by_naics,paste0(out_path,"proj_by_naics.csv"),row.names=FALSE)

# Permits by sector and state
proj_by_naics_state <- aggregate(filtered404_naics$PERMIT_ID,by = list(filtered404_naics$STATE_FINAL,filtered404_naics$NAICS,filtered404_naics$`NAICS Industry Description`),FUN=length)
names(proj_by_naics_state) <- c("State","NAICS","NAICS Industry Description","Number of permits")

# Permits by water type and state
proj_by_water_state <- aggregate(filtered404_naics$PERMIT_ID,by = list(filtered404_naics$STATE_FINAL,filtered404_naics$COWARDIN_CATEGORY),FUN=length)
names(proj_by_water_state) <- c("State","Water_Type","Number of permits")

# Percent of projects by state, water type, sector
proj_by_state_water_sector <- aggregate(filtered404_naics$PERMIT_ID,by = list(filtered404_naics$STATE_FINAL,filtered404_naics$COWARDIN_CATEGORY,filtered404_naics$NAICS,filtered404_naics$`NAICS Industry Description`), FUN = length)
names(proj_by_state_water_sector) <- c("State","Water_Type","NAICS","NAICS Industry Description","Count")
proj_by_state_water_sector <- merge(proj_by_state_water_sector,proj_by_water_state,all.x=TRUE)
proj_by_state_water_sector$perc_of_permits <- proj_by_state_water_sector$Count / proj_by_state_water_sector$`Number of permits`
proj_by_state_water_sector$Industry <- paste0(proj_by_state_water_sector$NAICS)
proj_by_state_water_sector_summ <- proj_by_state_water_sector[,c(1,2,7,8)]
proj_by_state_water_sector_final <- dcast(proj_by_state_water_sector_summ,State+Water_Type~Industry,value.var="perc_of_permits",fun.aggregate = sum)

# Percent of projects by sector and state
proj_by_naics_state_perc <- proj_by_naics_state
proj_by_state <- aggregate(proj_by_naics_state$`Number of permits`,by=list(proj_by_naics_state$State),FUN=sum)
names(proj_by_state) <- c("State","Total state permits")
proj_by_naics_state_perc <- merge(proj_by_naics_state_perc,proj_by_state,all.x=TRUE)
proj_by_naics_state_perc$perc_of_permits <- proj_by_naics_state_perc$`Number of permits` / proj_by_naics_state_perc$`Total state permits`
proj_by_naics_state_perc$Industry <- paste0(proj_by_naics_state_perc$NAICS)
proj_by_naics_state_perc_summ <- proj_by_naics_state_perc[,c(1,6,7)]
proj_by_naics_state_perc_final <- dcast(proj_by_naics_state_perc_summ,State~Industry,value.var="perc_of_permits",fun.aggregate = sum)
proj_by_naics_state_perc_final$Water_Type <- "All water types"
proj_by_naics_state_perc_final <- proj_by_naics_state_perc_final[,c(1,24,2:23)]

# Add row with all water types to percent of projects by state, water type, sector
proj_by_state_water_sector_final <- rbind(proj_by_state_water_sector_final,proj_by_naics_state_perc_final)
proj_by_state_water_sector_final <- arrange(proj_by_state_water_sector_final,State)
write.csv(proj_by_state_water_sector_final,paste0(out_path,"proj_by_state_water_naics.csv"),row.names = FALSE)

# Percent of projects by sector and water type
proj_by_sector_water <- aggregate(filtered404_naics$PERMIT_ID,by=list(filtered404_naics$NAICS,filtered404_naics$`NAICS Industry Description`,filtered404_naics$COWARDIN_CATEGORY),FUN=length)
names(proj_by_sector_water) <- c("NAICS","NAICS_Industry_Description","Water_Type","Count")
proj_by_sector_water <- merge(proj_by_sector_water,proj_by_naics,all.x=TRUE)
proj_by_sector_water$`NAICS Industry Description` <- NULL
proj_by_sector_water$perc_of_permits <- proj_by_sector_water$Count / proj_by_sector_water$`Number of permits`
proj_by_sector_water$Count <- NULL
proj_by_sector_water$`Number of permits` <- NULL
proj_by_sector_water_final <- dcast(proj_by_sector_water,NAICS+NAICS_Industry_Description~Water_Type,value.var="perc_of_permits",fun.aggregate = sum)
write.csv(proj_by_sector_water_final,paste0(out_path,"proj_by_naics_water.csv"),row.names = FALSE)

# Costs by sector and water type
perc_costs_by_sector_water_final <- proj_by_sector_water_final





