## Functions for reading EIA spreadsheets

identify_target_plants <- function(gnr_dir, plt_dir){

  # Build the list of conventional hydropower (prime mover "HY") plants to
  # analyse, based on the 2022 EIA files.
  #
  # gnr_dir: directory holding the EIA-923 generation folders; it is pasted
  #          directly in front of "f923_2022/", so it must end with a path
  #          separator.
  # plt_dir: directory holding the EIA-860 plant folders; it is pasted in
  #          front of "/eia8602022/", so it must NOT end with a separator.
  #
  # Returns a one-column tibble (EIA_ID, integer) of plants that
  #   (1) have an "HYC" fuel-type record in the 2022 EIA-923 file and are not
  #       located in AK or HI, and
  #   (2) have summed "HY" nameplate capacity >= capacity_cutoff_MW.
  #
  # NOTE: capacity_cutoff_MW is not an argument; it is looked up in the
  # enclosing environment and must be defined before this function is called.

  # --- plants reporting conventional-hydro generation in 2022 (CONUS only) ---
  generation_file <- paste0(gnr_dir,
                            "f923_2022/",
                            "EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx")

  plants_with_gen <- suppressWarnings(
    read_xlsx(generation_file, skip = 5, progress = FALSE) |>
      filter(`AER\r\nFuel Type Code` == "HYC",
             `Plant State` != "AK",
             `Plant State` != "HI") |>
      select(EIA_ID = `Plant Id`) |>
      unique() |>
      mutate(EIA_ID = as.integer(EIA_ID))
  )

  # --- plant-level HY nameplate from the 2022 generator file ---
  # Generators are summed per plant, then the plant must both appear in the
  # generation list above and meet the capacity cutoff.
  generator_file <- paste0(plt_dir,
                           "/eia8602022/",
                           "3_1_Generator_Y2022.xlsx")

  plants_above_cutoff <- suppressWarnings(
    read_xlsx(generator_file, skip = 1) |>
      select(EIA_ID = `Plant Code`,
             nameplate = `Nameplate Capacity (MW)`,
             `Prime Mover`) |>
      filter(`Prime Mover` == "HY") |>
      mutate(EIA_ID = as.integer(EIA_ID)) |>
      summarise(nameplate = sum(nameplate), .by = EIA_ID) |>
      filter(EIA_ID %in% plants_with_gen[["EIA_ID"]],
             nameplate >= capacity_cutoff_MW)
  )

  # only the plant identifiers are returned
  select(plants_above_cutoff, EIA_ID)

}


# Read one year's EIA-860 generator file and return hydro nameplate per plant.
#
# plt_dir: directory holding the yearly "eia860a<yr>" / "eia860<yr>" folders
#          (no trailing separator; paths are built with paste0).
# yr:      a single year in 1990:2022.
#
# Returns a tibble with columns EIA_ID and nameplate (MW), summed over the
# plant's generators that pass the prime-mover and status filters. File names,
# column names, prime-mover codes and status codes differ between reporting
# eras, hence one branch per era. Stops with an error for any other year.
.read_hydro_nameplate_MW <- function(plt_dir, yr){

  yy <- substr(yr, 3, 4)

  # status codes retained in the 1990-2000 files and in the 2001+ files
  kept_status_pre2001 <- c("OP", "SB", "SD", "TS", "OT")
  kept_status_2001on <- c("OP", "SB", "OA")

  if(yr %in% 1990:1991){

    gens <- read_xls(paste0(plt_dir, "/eia860a", yr,
                            "/GENTYPE3Y", yy, ".xls")) |>
      as_tibble() |>
      filter(`Prime Mover` == "HC") |>
      # status is held in one of two columns; fall back to the second
      mutate(STATUS = if_else(is.na(`Status Code...9`),
                              `Status Code...22`,
                              `Status Code...9`)) |>
      filter(STATUS %in% kept_status_pre2001) |>
      select(EIA_ID = `Plant Code`,
             nameplate = `Nameplate Capacity...7`)

  } else if(yr %in% 1992:1997){

    # same column layout for these years; file name and hydro code differ
    if(yr <= 1994){
      gen_file <- paste0("/TYPE3", yy, ".xls")
      hydro_movers <- "HC"
    } else if(yr <= 1996){
      gen_file <- paste0("/TYPE3Y", yy, ".xls")
      hydro_movers <- c("HY", "HC")
    } else {
      gen_file <- "/GENERTOR.xls"
      hydro_movers <- "HY"
    }

    gens <- read_xls(paste0(plt_dir, "/eia860a", yr, gen_file)) |>
      as_tibble() |>
      filter(PRIMEMOVER %in% hydro_movers) |>
      mutate(STATUS = if_else(is.na(STATUSCODE),
                              PRSTATUSCD,
                              STATUSCODE)) |>
      filter(STATUS %in% kept_status_pre2001) |>
      select(EIA_ID = PLNTCODE, nameplate = NAMEPLATE)

  } else if(yr %in% 1998:2000){

    # the 1998 workbook prefixes the sheet name with the year
    gen_sheet <- if(yr == 1998){
      paste0(yr, " Existing Generators")
    } else {
      "Existing Generators"
    }

    gens <- read_xls(paste0(plt_dir, "/eia860a", yr,
                            "/ExistingGenerators", yr, ".xls"),
                     sheet = gen_sheet) |>
      as_tibble() |>
      filter(PRIMEMOVER == "HY",
             EXISTING_STATUS %in% kept_status_pre2001) |>
      select(EIA_ID = PLANT_CODE, nameplate = EXISTING_NAMEPLATE)

  } else if(yr %in% 2001:2008){

    # 2001-2003 are dBase files, 2004-2008 are xls; same column names
    raw <- if(yr <= 2003){
      read.dbf(paste0(plt_dir, "/eia860", yr, "/GENY", yy, ".dbf"))
    } else {
      read_xls(paste0(plt_dir, "/eia860", yr, "/GenY", yy, ".xls"))
    }

    gens <- raw |>
      as_tibble() |>
      filter(PRIMEMOVER == "HY",
             STATUS %in% kept_status_2001on) |>
      select(EIA_ID = PLNTCODE, nameplate = NAMEPLATE)

  } else if(yr %in% 2009:2011){

    raw <- if(yr == 2009){
      read_xls(paste0(plt_dir, "/eia860", yr, "/GeneratorY", yy, ".xls"))
    } else if(yr == 2010){
      read_xls(paste0(plt_dir, "/eia860", yr, "/GeneratorsY", yr, ".xls"))
    } else {
      read_xlsx(paste0(plt_dir, "/eia860", yr, "/GeneratorY", yr, ".xlsx"),
                skip = 1)
    }

    gens <- raw |>
      as_tibble() |>
      filter(PRIME_MOVER == "HY",
             STATUS %in% kept_status_2001on) |>
      select(EIA_ID = PLANT_CODE, nameplate = NAMEPLATE)

  } else if(yr %in% 2012:2022){

    file_stem <- if(yr == 2012) "/GeneratorY" else "/3_1_Generator_Y"

    gens <- read_xlsx(paste0(plt_dir, "/eia860", yr,
                             file_stem, yr, ".xlsx"),
                      skip = 1) |>
      as_tibble() |>
      filter(`Prime Mover` == "HY",
             Status %in% kept_status_2001on) |>
      select(EIA_ID = `Plant Code`,
             nameplate = `Nameplate Capacity (MW)`)

  } else {
    stop("No EIA-860 generator reader defined for year ", yr, call. = FALSE)
  }

  # files up to 2000 are treated as kW and converted to MW; later files
  # are used as reported (MW)
  if(yr <= 2000){
    summarise(gens, nameplate = 1e-3 * sum(nameplate), .by = EIA_ID)
  } else {
    summarise(gens, nameplate = sum(nameplate), .by = EIA_ID)
  }
}


# Read one of the yearly nonutility monthly generation files (1999 or 2000
# layout) and return net generation per target plant.
#
# file:          full path of the xls file.
# target_plants: vector of plant IDs to keep.
# excluded_ids:  plant IDs to drop (records that are mostly zero).
#
# Returns a tibble with EIA_ID, year, Jan..Dec and TOTAL; the numeric values
# are converted from kWh to MWh.
.read_nonutil_monthly_MWh <- function(file, target_plants, excluded_ids){

  read_xls(file) |>
    # PRIMEMOVER == 1: presumably the hydro prime-mover code in this file
    # layout -- TODO confirm against the EIA file documentation
    filter(FACILITYID %in% target_plants,
           PRIMEMOVER == 1) |>
    # when gross generation is reported (GENERATYPE "G"), net generation is
    # computed as gross generation minus consumption.
    # NOTE(review): the June consumption column is spelled JUNCOMSUMP here;
    # confirm this matches the column name in the file.
    mutate(Jan = if_else(GENERATYPE == "G", JANGENERAT - JANCONSUMP, JANGENERAT),
           Feb = if_else(GENERATYPE == "G", FEBGENERAT - FEBCONSUMP, FEBGENERAT),
           Mar = if_else(GENERATYPE == "G", MARGENERAT - MARCONSUMP, MARGENERAT),
           Apr = if_else(GENERATYPE == "G", APRGENERAT - APRCONSUMP, APRGENERAT),
           May = if_else(GENERATYPE == "G", MAYGENERAT - MAYCONSUMP, MAYGENERAT),
           Jun = if_else(GENERATYPE == "G", JUNGENERAT - JUNCOMSUMP, JUNGENERAT),
           Jul = if_else(GENERATYPE == "G", JULGENERAT - JULCONSUMP, JULGENERAT),
           Aug = if_else(GENERATYPE == "G", AUGGENERAT - AUGCONSUMP, AUGGENERAT),
           Sep = if_else(GENERATYPE == "G", SEPGENERAT - SEPCONSUMP, SEPGENERAT),
           Oct = if_else(GENERATYPE == "G", OCTGENERAT - OCTCONSUMP, OCTGENERAT),
           Nov = if_else(GENERATYPE == "G", NOVGENERAT - NOVCONSUMP, NOVGENERAT),
           Dec = if_else(GENERATYPE == "G", DECGENERAT - DECCONSUMP, DECGENERAT)) |>
    select(EIA_ID = FACILITYID, year = YEAR, all_of(month.abb)) |>
    filter(!EIA_ID %in% excluded_ids) |>
    mutate(TOTAL = Jan + Feb + Mar + Apr + May + Jun +
             Jul + Aug + Sep + Oct + Nov + Dec) |>
    # cast the keys to integer first so the unit conversion below only
    # touches the generation columns
    mutate(EIA_ID = as.integer(EIA_ID),
           year = as.integer(year)) |>
    # these data are in kWh. Convert to MWh:
    mutate(across(where(is.double), function(x) x * 1e-3))
}


# Assemble monthly and annual net generation, nameplate capacity and annual
# capacity factor for the target hydropower plants, 1980-2022.
#
# gnr_dir:       directory holding the EIA generation files (f759, f906,
#                f906920, f923); pasted directly in front of file/folder
#                names, so it must end with a path separator.
# plt_dir:       directory holding the EIA-860 plant/generator folders; pasted
#                in front of "/eia860...", so no trailing separator.
# target_plants: vector of plant IDs. It is used with %in% and as a tibble
#                column, so it must be a plain vector; it should be integer so
#                that joins and the unit conversions behave as intended.
#
# Returns a tibble with one row per plant-year record: EIA_ID, twelve monthly
# generation columns, TOTAL, year, nameplate, n_hrs, cap_MWh (= nameplate *
# n_hrs) and Annual_CF (= TOTAL / cap_MWh).
get_EIA_monthly_gen <- function(gnr_dir, plt_dir, target_plants){

  # PART 1. Get capacity for target plants

  # 1990 -> 2022: nameplate from the EIA-860 generator files
  plant_nameplate_MW_1990_2022 <- 1990:2022L |>
    map_dfr(function(yr){

      nameplate_all <- suppressWarnings(
        .read_hydro_nameplate_MW(plt_dir, yr)
      )

      # one row per target plant, NA where the plant is absent that year
      tibble(EIA_ID = target_plants) |>
        left_join(nameplate_all, by = join_by(EIA_ID)) |>
        mutate(year = !!yr)
    })

  # reported plant capacity pre-1990 is available only via generation
  # data files (all records of a plant are summed, whatever their type)
  plant_nameplate_MW_1970_1989 <- 1970:1989L |>
    map_dfr(function(yr){

      plant_nameplate <- suppressWarnings(
        read_xls(paste0(gnr_dir, "f759",
                        yr, "u.xls")) |>
          mutate(PCODE = as.integer(PCODE)) |>
          filter(PCODE %in% target_plants) |>
          # 1e-3: unit correction to MW
          summarise(nameplate = 1e-3 * sum(CAPACITY), .by = PCODE) |>
          rename(EIA_ID = PCODE)
      )

      tibble(EIA_ID = target_plants) |>
        left_join(plant_nameplate, by = join_by(EIA_ID)) |>
        mutate(year = !!yr)
    })

  # combine plant capacity data to single table
  plant_nameplate_MW <- bind_rows(
    plant_nameplate_MW_1970_1989,
    plant_nameplate_MW_1990_2022
  )

  # interpolate gaps in capacity data (including weird 0 values)
  plant_nameplate_MW_interpolated <- plant_nameplate_MW |>
    split(~EIA_ID) |>
    map_dfr(function(x){

      x_wide <- pivot_wider(x, names_from = "year", values_from = "nameplate")

      # deal with odd cases where capacity decreases between 1986 and 1990
      # (due mainly to PS included in pre-1990 nameplate data)
      jump_86_90 <- x_wide[["1986"]] - x_wide[["1990"]]

      # If either year is missing the jump is NA. Treat that as "no known
      # drop" so the 1970-1986 values are kept; an NA condition would make
      # if_else() overwrite them all with NA.
      has_drop <- coalesce(jump_86_90 > 0, FALSE)

      x |>
        mutate(nameplate = if_else(
          year %in% 1970:1986 & has_drop,
          nameplate - jump_86_90,
          nameplate
        )) |>
        # zeros in 1987-2000 are treated as missing, then gaps are linearly
        # interpolated (leading/trailing NAs are left as NA)
        mutate(
          nameplate = if_else(year %in% 1987:2000 & nameplate == 0,
                              NA_real_, nameplate),
          nameplate = na.approx(nameplate,
                                na.rm = FALSE))
    })


  # PART 2. Get generation for target plants
  # start with nonutil generation, which is contained in ...
  # ... three separate files (1989->1998; 1999; 2000)

  # 1989 -> 1998 non util generation (annual totals only)
  nonutil_1989_1998 <- read_xls(
    paste0(gnr_dir, "f906nonutil1989/",
           "1989 to 1998 Nonutility Power Producer Data.xls")
  ) |>
    filter(`Aggregate Fuel Type` == "HYC") |>
    select(year = Year, EIA_ID = `Facility Code`,
           TOTAL = `Net Generation (MWh)`) |>
    filter(EIA_ID %in% target_plants) |>
    mutate(EIA_ID = as.integer(EIA_ID), year = as.integer(year)) |>
    # deal with multiple values for some plants (e.g., 54134)
    summarise(TOTAL = sum(TOTAL, na.rm = TRUE), .by = c(year, EIA_ID))

  # 1999 non util generation (two mostly-zero records removed)
  nonutil_1999 <- .read_nonutil_monthly_MWh(
    paste0(gnr_dir, "f906nonutil1999/", "F9061999nu.xls"),
    target_plants,
    excluded_ids = c(2188, 2195)
  )

  # 2000 non util generation (five mostly-zero records removed)
  nonutil_2000 <- .read_nonutil_monthly_MWh(
    paste0(gnr_dir, "f906nonutil2000/", "F9062000nu.xls"),
    target_plants,
    excluded_ids = c(552, 553, 1629, 1630, 3145)
  )

  nonutil_1989_2000 <- bind_rows(
    nonutil_1989_1998,
    nonutil_1999,
    nonutil_2000
  )

  # 1980 -> 2000: utility generation from the f759 files
  gen_1980_2000 <- 1980:2000L |>
    map_dfr(function(yr){

      # years 1980 -> 1995 all utilities reported monthly...
      # ... (see https://www.eia.gov/electricity/data/eia923/eia906u.php)
      if(yr %in% 1980:1995){

        plant_gen_monthly <- suppressWarnings(
          read_xls(paste0(gnr_dir, "f759",
                          yr, "u.xls"))
        ) |>
          mutate(PCODE = as.integer(PCODE)) |>
          select(EIA_ID = PCODE, starts_with("GEN"), FUELDESC, PMDESC) |>
          filter(EIA_ID %in% target_plants,
                 FUELDESC == "WAT", PMDESC == "HY") |>
          select(-FUELDESC, -PMDESC) |>
          pivot_longer(!EIA_ID, names_to = "month", values_to = "gen_MWh") |>
          # characters 4-5 of the GENxx column name give the month number
          mutate(month = month(as.integer(substr(month, 4, 5)),
                               label = TRUE))

        plant_gen_annual <- plant_gen_monthly |>
          summarise(gen_MWh = sum(gen_MWh),
                    .by = EIA_ID)

        utility_gen <- plant_gen_monthly |>
          pivot_wider(names_from = "month", values_from = gen_MWh) |>
          left_join(plant_gen_annual, by = join_by(EIA_ID)) |>
          rename(TOTAL = gen_MWh)

        # unit correction for 1980->1989. It is applied to the utility data
        # only, BEFORE the nonutility rows are appended: the 1989 nonutility
        # totals are already in MWh and must not be rescaled.
        if(yr %in% 1980:1989){
          utility_gen <- utility_gen |>
            mutate(across(where(is.double), function(x) x * 1e-3))
        }

        plant_gen_all <- bind_rows(
          utility_gen,
          filter(nonutil_1989_2000, year == !!yr) |> select(-year)
        )

      }

      # years 1996 -> 2000: annual totals come from the annual file; monthly
      # values are only taken for records flagged as monthly reporters
      # (FREQUENCYF == "M") in the monthly file, and no statistical
      # imputation was performed for the others.
      #
      # NOTE(review): nonutility generation is only appended in the
      # 1980-1995 branch above, so the 1996-2000 nonutility data read
      # earlier never reach the output. Confirm whether that is intended.
      if(yr %in% 1996:2000){

        plant_gen_annual <- suppressWarnings(
          read_xls(paste0(gnr_dir, "f759", yr, "u.xls"))
        ) |>
          mutate(PCODE = as.integer(PCODE)) |>
          select(EIA_ID = PCODE, NETGENERAT, FUELDESC, PMDESC) |>
          filter(EIA_ID %in% target_plants,
                 FUELDESC == "WAT", PMDESC == "HY") |>
          select(-FUELDESC, -PMDESC)

        plant_gen_monthly <- suppressWarnings(
          read_xls(paste0(gnr_dir, "f759", yr, "mu.xls"))
        ) |>
          mutate(PCODE = as.integer(PCODE)) |>
          filter(FREQUENCYF == "M", FUELDESC == "WAT", PMDESC == "HY") |>
          select(EIA_ID = PCODE, starts_with("GEN")) |>
          filter(EIA_ID %in% target_plants) |>
          pivot_longer(!EIA_ID, names_to = "month", values_to = "gen_MWh") |>
          mutate(month = month(as.integer(substr(month, 4, 5)),
                               label = TRUE))

        # full plant x month grid so every month column exists even when
        # no plant reported monthly
        plant_gen_all <- expand.grid(EIA_ID = target_plants,
                                     month = 1:12L) |>
          as_tibble() |>
          mutate(month = month(month, label = TRUE)) |>
          left_join(plant_gen_monthly, by = join_by(EIA_ID, month)) |>
          pivot_wider(names_from = "month", values_from = gen_MWh) |>
          left_join(plant_gen_annual, by = join_by(EIA_ID)) |>
          rename(TOTAL = NETGENERAT)

      }

      tibble(EIA_ID = target_plants) |>
        left_join(plant_gen_all, by = join_by(EIA_ID)) |>
        mutate(year = as.integer(!!yr))

    })

  # 2001 -> 2022: EIA-906/920 and EIA-923 files
  gen_2001_2022 <- 2001:2022 |>
    map_dfr(function(yr){

      if(yr %in% 2001:2002){

        file_name <- paste0(gnr_dir, "f906920_", yr, "/f906920_", yr, ".xls")

        # read EIA file (plant id cast to integer for these two years)
        EIA_data <- suppressWarnings(
          read_xls(file_name, skip = 7, .name_repair = "unique_quiet") |>
            rename(eia_id = `Plant ID`) |>
            mutate(eia_id = as.integer(eia_id))
        )

        # filter EIA file for hydro (no prime-mover filter in these years)
        EIA_hydro <- EIA_data |>
          filter(eia_id %in% target_plants,
                 `Reported Fuel Type Code` == "WAT",
                 `AER Fuel Type Code` == "HYC") |>
          select(EIA_ID = eia_id,
                 one_of(paste0("NETGEN_", str_to_upper(month.abb))),
                 TOTAL = `NET GENERATION (megawatthours)`)

        return(
          tibble(EIA_ID = target_plants) |>
            left_join(EIA_hydro, by = "EIA_ID") |>
            mutate(year = !!yr)
        )
      }

      if(yr %in% 2003:2007){

        file_name <- paste0(gnr_dir, "f906920_", yr, "/f906920_", yr, ".xls")

        # read EIA file
        EIA_data <- suppressWarnings(
          read_xls(file_name, skip = 7, .name_repair = "unique_quiet") |>
            rename(EIA_ID = `Plant ID`)
        )

        # filter EIA file for hydro
        EIA_hydro <- EIA_data |>
          filter(EIA_ID %in% target_plants,
                 `Reported Fuel Type Code` == "WAT",
                 `AER Fuel Type Code` == "HYC",
                 `Reported Prime Mover` == "HY") |>
          select(EIA_ID,
                 one_of(paste0("NETGEN_", str_to_upper(month.abb))),
                 TOTAL = `NET GENERATION (megawatthours)`)

        return(
          tibble(EIA_ID = target_plants) |>
            left_join(EIA_hydro, by = "EIA_ID") |>
            mutate(year = !!yr)
        )
      }

      if(yr %in% 2008:2022){

        if(yr %in% 2008:2010){
          # xls workbooks, 7 header rows, id column "Plant ID"
          file_name <- paste0(gnr_dir, "f923_", yr,
                              "/EIA923 SCHEDULES ", yr, ".xls")
          EIA_data <- suppressWarnings(
            read_xls(file_name, skip = 7, .name_repair = "unique_quiet") |>
              rename(EIA_ID = `Plant ID`)
          )
        } else {
          # xlsx workbooks, 5 header rows, id column "Plant Id";
          # only the file name differs for 2021 and 2022
          file_name <- if(yr == 2021){
            paste0(gnr_dir, "f923_", yr,
                   "/EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx")
          } else if(yr == 2022){
            paste0(gnr_dir, "f923_", yr,
                   "/EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx")
          } else {
            paste0(gnr_dir, "f923_", yr,
                   "/EIA923 SCHEDULES ", yr, ".xlsx")
          }
          EIA_data <- suppressWarnings(
            read_xlsx(file_name, skip = 5, .name_repair = "unique_quiet") |>
              rename(EIA_ID = `Plant Id`)
          )
        }

        if(!(yr %in% c(2012, 2014:2022))){
          # single-line column headers
          EIA_data_filtered <- EIA_data |>
            filter(EIA_ID %in% target_plants,
                   `Reported Fuel Type Code` == "WAT",
                   `AER Fuel Type Code` == "HYC",
                   `Reported Prime Mover` == "HY")
        } else {
          # headers contain an embedded line break; monthly columns are
          # renamed to the NETGEN_<MON> form used in the other years
          EIA_data_filtered <- EIA_data |>
            filter(EIA_ID %in% target_plants,
                   `Reported\r\nFuel Type Code` == "WAT",
                   `AER\r\nFuel Type Code` == "HYC",
                   `Reported\r\nPrime Mover` == "HY") |>
            rename(NETGEN_JAN = `Netgen\r\nJanuary`,
                   NETGEN_FEB = `Netgen\r\nFebruary`,
                   NETGEN_MAR = `Netgen\r\nMarch`,
                   NETGEN_APR = `Netgen\r\nApril`,
                   NETGEN_MAY = `Netgen\r\nMay`,
                   NETGEN_JUN = `Netgen\r\nJune`,
                   NETGEN_JUL = `Netgen\r\nJuly`,
                   NETGEN_AUG = `Netgen\r\nAugust`,
                   NETGEN_SEP = `Netgen\r\nSeptember`,
                   NETGEN_OCT = `Netgen\r\nOctober`,
                   NETGEN_NOV = `Netgen\r\nNovember`,
                   NETGEN_DEC = `Netgen\r\nDecember`)
        }

        # header capitalisation varies between years; normalise it
        names(EIA_data_filtered) <- str_to_upper(names(EIA_data_filtered))

        EIA_hydro <- EIA_data_filtered |>
          select(EIA_ID,
                 one_of(paste0("NETGEN_", str_to_upper(month.abb))),
                 any_of(c("NET GENERATION (MEGAWATTHOURS)",
                          "NET GENERATION\r\n(MEGAWATTHOURS)")))

        # NOTE(review): positional rename -- relies on EIA_ID plus all
        # twelve monthly columns being present so the total is column 14
        names(EIA_hydro)[14] <- "TOTAL"

        return(
          tibble(EIA_ID = target_plants) |>
            left_join(EIA_hydro, by = "EIA_ID") |>
            mutate(EIA_ID = as.integer(EIA_ID)) |>
            mutate(year = as.integer(!!yr)) |>
            # some workbooks deliver numbers as text
            mutate(across(where(is.character), as.double))
        )
      }
    })

  # align column names with the 1980-2000 table.
  # NOTE(review): positional -- assumes both tables have their columns in
  # the same order (EIA_ID, twelve months, TOTAL, year)
  names(gen_2001_2022) <- names(gen_1980_2000)

  # generate table of hrs per year for computation of maximum output
  hrs_per_year <- tibble(
    date = seq.Date(from = ymd("1980-01-01"), to = ymd("2022-12-31"), by = 1)
  ) |>
    mutate(year = as.integer(year(date))) |>
    summarise(n_hrs = n() * 24, .by = year)

  # combine nameplate and generation, then add capacity factor column
  bind_rows(
    gen_1980_2000,
    gen_2001_2022
  ) |>
    left_join(plant_nameplate_MW_interpolated,
              by = join_by(EIA_ID, year)) |>
    left_join(hrs_per_year, join_by(year)) |>
    mutate(cap_MWh = nameplate * n_hrs,
           Annual_CF = TOTAL / cap_MWh)
}
