## Assign data frequency to EIA records

## 1. 1980 -> 1995, all data are monthly
## 2. 1996 -> 2000, monthly where monthly values are provided (otherwise annual)
## 3. 2011 -> 2022, use spreadsheet tab "Page 6 Plant Frame"
## 4. 2001 -> 2010, no respondent frequency given; use distance matrix ....
## ... and validate using periods with frequency designation.

## Label every plant-year record with its survey reporting frequency.
##
## Arguments:
##   gnr_dir            Directory prefix holding the "f923_<year>/" folders with
##                      the EIA-923 workbooks. It is pasted directly in front of
##                      "f923_", so it must end with a path separator.
##   gen_aCF_1980_2022  Data frame of plant-year generation records. The code
##                      uses the columns EIA_ID, year, Jan, TOTAL, nameplate,
##                      n_hrs, cap_MWh and Annual_CF; the remaining columns are
##                      presumably the other monthly values (TODO confirm).
##
## Returns:
##   The input records (without nameplate, n_hrs, cap_MWh, Annual_CF) for
##   1980-2022, sorted by year and EIA_ID, with a `freq` column holding
##   "No data", "Monthly survey" or "Annual survey". `freq` stays NA for
##   records with a non-missing TOTAL whose frequency could not be determined
##   (e.g. a 2011-2022 plant that is absent from the plant frame).
assign_data_frequency <- function(gnr_dir,
                                  gen_aCF_1980_2022){

  ## Capacity-factor columns are not needed for the frequency assignment.
  drop_capacity_cols <- function(df){
    df |> select(-nameplate, -n_hrs, -cap_MWh, -Annual_CF)
  }

  ## Path of the EIA-923 workbook containing the plant frame for year `yr`.
  plant_frame_file <- function(yr){
    workbook <- if(yr == 2021){
      "EIA923_Schedules_2_3_4_5_M_12_2021_Final_Revision.xlsx"
    } else if(yr == 2022){
      "EIA923_Schedules_2_3_4_5_M_12_2022_Final.xlsx"
    } else {
      paste0("EIA923 SCHEDULES ", yr, ".xlsx")
    }
    paste0(gnr_dir, "f923_", yr, "/", workbook)
  }

  ## Read plant id and reporting frequency from "Page 6 Plant Frame".
  ## Column headers differ between workbook vintages, so each target column
  ## is picked from a list of known spellings.
  read_plant_frame <- function(yr){
    file_name <- plant_frame_file(yr)

    # readxl warnings (e.g. from column type guessing) are deliberately muted.
    suppressWarnings(
      read_xlsx(file_name, skip = 4, .name_repair = "unique_quiet",
                sheet = "Page 6 Plant Frame")
    ) |>
      select(EIA_ID = any_of(c("EIA Plant Id", "Plant Id")),
             freq = any_of(c("Reporting Frequency (Annual Or Monthly)",
                             "Reporting\r\nFrequency",
                             "Respondent\r\nFrequency"))) ->
      freq_by_plant

    # Fail here, with the file name, rather than later in an unrelated step
    # if a header was not recognised (or matched more than one spelling).
    if(!all(c("EIA_ID", "freq") %in% names(freq_by_plant))){
      stop("Could not identify the plant id and frequency columns in: ",
           file_name)
    }

    freq_by_plant |>
      mutate(year = as.integer(yr))
  }

  ## 1. 1980 -> 1995: all data are monthly

  gen_aCF_1980_2022 |>
    filter(year %in% 1980:1995) |>
    drop_capacity_cols() |>
    mutate(freq = "M") ->
    gen_freq_1980_1995

  ## 2. 1996 -> 2000: monthly where a January value is present, else annual

  gen_aCF_1980_2022 |>
    filter(year %in% 1996:2000) |>
    drop_capacity_cols() |>
    mutate(freq = if_else(is.na(Jan), "A", "M")) ->
    gen_freq_1996_2000

  ## 3. 2011 -> 2022: frequency is given in the EIA-923 plant frame

  2011:2022 |>
    map_dfr(read_plant_frame) ->
    freq_by_plant_and_yr_2011_2022

  freq_by_plant_and_yr_2011_2022 |>
    # anything that is not explicitly "A" is treated as monthly
    mutate(freq = if_else(freq == "A", "A", "M")) |>
    # manual override: plant 10613 is set to annual for 2020-2022
    mutate(freq = if_else(EIA_ID == 10613 & year %in% 2020:2022,
                          "A", freq)) ->
    freq_by_plant_and_yr_2011_2022_AM

  gen_aCF_1980_2022 |>
    filter(year %in% 2011:2022) |>
    drop_capacity_cols() |>
    left_join(freq_by_plant_and_yr_2011_2022_AM,
              by = join_by(EIA_ID, year)) ->
    gen_freq_2011_2022

  ## 4. 2001 -> 2010: no frequency given; infer it from duplicated profiles

  # Distance matrix analysis on all years from 2001: within each year, count
  # for every plant how many other plants have an (almost) identical
  # normalised monthly profile. Such copies are taken as the signature of an
  # annual respondent.

  gen_aCF_1980_2022 |>
    # years earlier than 2001 yield no copies--confirming monthly resolution
    # of these data!
    filter(year >= 2001) |>
    # remove cases with all zero (will erroneously "match" other plants)
    filter(TOTAL != 0) |>
    select(-nameplate, -n_hrs, -cap_MWh, -Annual_CF, -TOTAL) |>
    # !! should not have to perform unique() here! Check previous target
    unique() |>
    split(~year) |> map_dfr(function(gen_yr){

      # convert each plant's values to fractions of its yearly sum
      gen_yr |>
        pivot_longer(-c(EIA_ID, year)) |>
        mutate(value = value/sum(value), .by = c(EIA_ID, year)) |>
        pivot_wider() |>
        filter(!is.na(Jan)) ->
        fractions_for_dist_analysis

      # only the monthly fractions belong in the distance calculation
      fractions_for_dist_analysis |>
        select(-EIA_ID, -year) |>
        dist() |> as.matrix() |> as.data.frame() |>
        mutate(EIA_ID = fractions_for_dist_analysis$EIA_ID) -> dist_df

      # the distance matrix columns are named by row position ("1", "2", ...);
      # this lookup maps them back to plant ids
      tibble(EIA_compare = fractions_for_dist_analysis$EIA_ID,
             comparison_plant = as.character(
               seq_len(nrow(fractions_for_dist_analysis)))
             ) ->
        position_to_plant

      dist_df |>
        pivot_longer(-EIA_ID, names_to = "comparison_plant",
                     values_to = "distance") |>
        left_join(position_to_plant,
                  by = join_by(comparison_plant)) |>
        select(EIA_ID, distance, EIA_compare) |>
        filter(EIA_ID != EIA_compare) |>
        filter(distance < 0.00001) |>
        count(EIA_ID) |> mutate(year = first(gen_yr$year))

    }) ->
    copy_counts

  # Validation of approach: compare the estimate with the reported frequency
  # for 2011-2022. Plants reported as annual but not detected as copies in
  # more than one year are forced to annual for 2001-2010 below.
  # (A per-year histogram of `n` is a useful diagnostic here.)
  copy_counts |>
    mutate(est_freq = "A") |>
    select(EIA_ID, year, est_freq, n) |>
    filter(year %in% 2011:2022) |>
    right_join(gen_freq_2011_2022,
               by = join_by(EIA_ID, year)) |>
    mutate(est_freq = if_else(is.na(est_freq), "M", est_freq)) |>
    filter(est_freq != freq) |>
    filter(TOTAL != 0,
           !is.na(TOTAL)) |>
    filter(freq == "A") |>
    count(EIA_ID) |>
    filter(n > 1) |> pull(EIA_ID) ->
    revert_to_annual

  # notes:
  # 1574 should be "M" despite single pattern match.
  # xx should be added to annual if not there...

  copy_counts |>
    filter(year %in% 2001:2010) |>
    filter(!(EIA_ID %in% c(1574))) |>
    select(EIA_ID, year) |> unique() |>
    mutate(freq = "A") |>
    right_join(
      gen_aCF_1980_2022 |>
        filter(year %in% 2001:2010) |>
        drop_capacity_cols(),
      by = join_by(EIA_ID, year)
    ) |>
    mutate(freq = if_else(
      EIA_ID %in% c(revert_to_annual), "A", freq)
      ) |>
    mutate(freq = if_else(is.na(TOTAL), "A", freq)) |>
    # everything not flagged as annual by now is treated as monthly
    mutate(freq = if_else(is.na(freq), "M", freq)) ->
    gen_freq_2001_2010

  ## Combine all periods and translate the codes into labels

  bind_rows(
    gen_freq_1980_1995,
    gen_freq_1996_2000,
    gen_freq_2001_2010,
    gen_freq_2011_2022
  ) |> arrange(year, EIA_ID) |>
    # case for no data
    mutate(freq = case_when(
      is.na(TOTAL) ~ "No data",
      freq == "M" ~ "Monthly survey",
      freq == "A" ~ "Annual survey")
      ) ->
    all_monthly_data_with_freq

  return(all_monthly_data_with_freq)

}
