# ==============================================================
# RectifHydPlus Version 1.1. Reproducible data pipeline
# ==============================================================
# Creator: Sean Turner (Oak Ridge National Laboratory)
# Developers: Sean Turner & A.B. Siddik (Oak Ridge National Laboratory)
# Contact: turnersw@ornl.gov
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Load targets library and set target options, including libraries required:
library(targets)
tar_option_set(
  # Libraries required by the pipeline's targets. The order of the vector is
  # the same as originally declared; only the layout and comments differ.
  packages = c(
    # tidy data tables; fast csv reading; data wrangling; data reading
    "tibble", "vroom", "tidyr", "readr", "dplyr",
    # reading and writing parquet and arrow files; geospatial tools
    "arrow", "sf",
    # dates and times; time series tools (smoothing and interpolation)
    "lubridate", "zoo",
    # string manipulation tools; efficient function mapping
    "stringr", "purrr",
    # read MS Excel files; read .dbf files (EIA data files)
    "readxl", "foreign",
    # work with NHDplus river reaches; simulate reservoir operations
    "nhdplusTools", "starfit"
  ),
  # Storage format option; targets below that declare `format = "file"`
  # set their own format explicitly.
  format = "feather",
  # Memory-management options (see the targets documentation for semantics).
  memory = "transient",
  garbage_collection = TRUE
)

# Run the R scripts in the R/ folder (these define the functions called by
# the targets below):
tar_source()

# Set capacity cutoff in MW (plants with capacity greater than or equal to
# this value will be omitted).
# NOTE(review): this value is not referenced anywhere in this file; it is
# presumably read as a global by functions in R/ -- confirm the direction of
# the cutoff against that code.
capacity_cutoff_MW <- 10

# Set start and end years for EIA data pull, plus the span of reanalysis years.
# NOTE(review): none of these three values is referenced in this file;
# presumably they are read as globals by functions in R/ -- TODO confirm.
earliest_EIA_year <- 1980L
latest_EIA_year <- 2022L
reanalysis_years <- 1980:2019

# Priority table for the flow proxies used in RectifHydPlus, one row per proxy:
#   rank          - integer rank of the proxy
#   data_from     - identifier of the data source
#   quality_label - human-readable quality grade of the proxy
# Rows are listed in their original declaration order (ranks 1, 2, 4, 3, 5, 6).
# NOTE(review): rank and label letter disagree for the Dayflow row (rank 4,
# label "C.") and the ISTARF row (rank 3, label "D.") -- confirm this is
# intended.
flow_priority <-
  tibble::tibble(
    rank = c(1L, 2L, 4L, 3L, 5L, 6L),
    data_from = c(
      "ResOps",
      "downstream_USGS_gauge",
      "Dayflow (RoR only)",
      "ISTARF",
      "HUC4 outlet gauge",
      "Fractions copied"
    ),
    quality_label = c(
      "A. Observed reservoir release (best proxy)",
      "B. Observed river discharge downstream of dam (good proxy)",
      "C. Simulated streamflow, run-of-river plants (moderate proxy)",
      "D. Simulated reservoir release (moderate proxy)",
      "E. Observed river discharge near the HUC4 outlet (weak proxy)",
      "F1. Fractions copied from similar flow year (weakest proxy)"
    )
  )

# Begin data pipeline:
list(
  # Pipeline overview (in order of appearance):
  #   1. input file/directory targets (format = "file") and their readers
  #   2. EIA generation and capacity data, plant-to-water mapping
  #   3. annual flow / capacity-factor models
  #   4. daily release by each method, then selection per dam
  #   5. spill quantile calibration and daily penstock release
  #   6. monthly downscaling and final RectifHydPlus tables
  # Targets that do not declare a format use the option set in
  # tar_option_set() at the top of this file.

  # identify EHA data file
  tar_target(
    EHA_shp,
    "data/ORNL_EHAHydroPlant_PublicFY2024/Plant_external2024.shp",
    format = "file"
  ),
  # read EHA and save as target
  tar_target(
    EHA,
    read_EHA(EHA_shp)
  ),
  # identify HESC data file
  tar_target(
    HESC_csv,
    "data/HESC_v2/HESC_V2_Core_Characteristics.csv",
    format = "file"
  ),
  # read HESC and save as target
  tar_target(
    HESC,
    read_csv(HESC_csv, show_col_types = FALSE)
  ),
  # identify file to map COMID to USGS Gauge IDs
  tar_target(
    dayflow_gauge_table,
    "data/Gauge_COMID_20210409.csv",
    format = "file"
  ),
  # read and clean COMID-USGS mapping file and save as target
  tar_target(
    dayflow_gauge_table_clean,
    clean_dayflow_gauge_table(dayflow_gauge_table)
  ),
  # identify dayflow (annual) data file
  tar_target(
    Dayflow_hydro_dams,
    "data/DayFlow/DayFlow_dams_annual_BCM_AORC.parquet",
    format = "file"
  ),
  # identify dayflow (daily) points of interest file
  tar_target(
    Dayflow_hydro_dams_daily_POI,
    "data/DayFlow/VIC_RAPID_DaymetAORC2019_POI_flows_cfs.parquet",
    format = "file"
  ),
  # identify supplemental dayflow (daily) flow file
  tar_target(
    Dayflow_hydro_dams_daily_supplement,
    "data/DayFlow/VIC_RAPID_DaymetAORC2019_SUP_flows_cfs.parquet",
    format = "file"
  ),
  # read dayflow (daily) files, combine, and save as target
  tar_target(
    Dayflow_hydro_dams_daily,
    combine_dayflow_data(Dayflow_hydro_dams_daily_POI,
                         Dayflow_hydro_dams_daily_supplement)
  ),
  # identify COMID corrections file
  tar_target(
    COMID_corrections,
    "data/COMID_corrections.csv",
    format = "file"
  ),
  # identify HILARRI version 3 file
  tar_target(
    HILARRI_gpkg,
    "data/HILARRI_v3/HILARRI_v3_preliminary.gpkg",
    format = "file"
  ),
  # identify HILARRI amendments file
  tar_target(
    HILARRI_amendments,
    "data/misc/HILARRI_changes.csv",
    format = "file"
  ),
  # read HILARRI, convert to tibble, and save as target
  tar_target(
    HILARRI,
    read_HILARRI(HILARRI_gpkg, HILARRI_amendments)
  ),

  # identify USGS daily flows file (downstream of dams)
  tar_target(
    USGS_flow_parquet,
    "data/USGS/p00060_dv_1980_2019.parquet",
    format = "file"
  ),
  # identify USGS daily flows file (HUC4 outlets)
  tar_target(
    USGS_flow_HUC4outlet_parquet,
    "data/USGS/HUC4outlet_p00060_dv_1980_2019.parquet",
    format = "file"
  ),
  # identify ResOpsUS data directory
  tar_target(
    ResOps_dir,
    "data/ResOpsUS",
    format = "file"
  ),
  # identify ResOpsUS daily release data file
  tar_target(
    ResOpsUSrelease_csv,
    "data/ResOpsUS/time_series_single_variable_table/DAILY_AV_OUTFLOW_CUMECS.csv",
    format = "file"
  ),
  # identify ISTARF data directory
  tar_target(
    ISTARF_dir,
    "data/ISTARF",
    format = "file"
  ),
  # identify directory with EIA generation data files
  tar_target(
    EIA_529_906_920_923,
    "./data/EIA/Generation/",
    format = "file"
  ),
  # identify directory with EIA plant data files
  tar_target(
    EIA_860,
    "./data/EIA/Plant/",
    format = "file"
  ),
  # identify California Energy Commission generation data file (QFER)
  tar_target(
    QFER_generation_csv,
    "data/QFER_CEC_1304/QFERGeneratorMonthly_Table.csv",
    format = "file"
  ),
  # identify QFER cross-reference file (used below to map CEC plant IDs
  # to EIA plant IDs)
  tar_target(
    QFER_mapping_csv,
    "data/QFER_CEC_1304/Cross_Reference_Table.csv",
    format = "file"
  ),
  # define target plants (EIA IDs)
  # (function defined in "/R/EIA_xl_readers.R")
  tar_target(
    target_plants,
    identify_target_plants(gnr_dir = EIA_529_906_920_923,
                           plt_dir = EIA_860)
  ),
  # combine all EIA data files for generation and capacity
  # (function defined in "/R/EIA_xl_readers.R")
  tar_target(
    gen_aCF_1980_2022,
    get_EIA_monthly_gen(gnr_dir = EIA_529_906_920_923,
                        plt_dir = EIA_860,
                        target_plants = target_plants[["EIA_ID"]])
  ),
  # assign collection frequency label to each plant and year
  # (function defined in "/R/assign_frequency.R")
  tar_target(
    gen_1980_2022_monthly_with_freq,
    assign_data_frequency(gnr_dir = EIA_529_906_920_923,
                          gen_aCF_1980_2022)
  ),
  # link RHID to hydrologic datasets and dam/reservoir ...
  # ... specifications, in addition to downstream USGS gauge IDs
  # (function defined in "/R/map_to_water.R")
  tar_target(
    target_plants_mapped_to_water,
    map_reservoir_and_reach(EHA = EHA,
                            HILARRI = HILARRI,
                            HESC = HESC,
                            COMID_corrections = COMID_corrections,
                            target_plants = target_plants)
  ),
  # extract reservoir release data and assign to plants
  # (function defined in "/R/map_to_water.R")
  tar_target(
    resOps_releases,
    get_ResOps_releases(target_plants_mapped_to_water,
                        ResOpsUSrelease_csv)
  ),
  # create annual flow information for all plants
  # (function defined in "/R/Annual CF model.R")
  tar_target(
    annual_flow_all_plants,
    prep_annual_inflow(target_plants_mapped_to_water,
                       gen_aCF_1980_2022,
                       Dayflow_hydro_dams,
                       resOps_releases)
  ),
  # combine annual CF and annual flow
  # (function defined in "/R/Annual CF model.R")
  tar_target(
    annual_energy_CF_water,
    prep_annual_energy_CF_water(target_plants_mapped_to_water,
                                gen_aCF_1980_2022,
                                annual_flow_all_plants)
  ),
  # parameterize annual flow to CF model for each plant
  # (function defined in "/R/Annual CF model.R")
  tar_target(
    annual_CF_models,
    fit_annual_CF_models(annual_energy_CF_water)
  ),
  #
  # get daily water release by various methods:
  # 1. ResOpsUS (preferred)
  # 2. HILARRIv3 downstream USGS gauge (next best)
  # 3. ISTARF (not ideal)
  # 4. Inflow = Outflow (RoR only)
  # 5. HUC4 outlet gauge
  # 6. Copycat (no release target here; presumably covered by
  #    monthly_supp_fractions further below -- TODO confirm)
  # (functions defined in "/R/daily release all methods.R")
  tar_target(
    release_by_ResOps_method,
    RM1_ResOps(target_plants_mapped_to_water,
               resOps_releases)
  ),
  tar_target(
    release_by_USGS_method,
    RM2_HILARRI(target_plants_mapped_to_water,
                USGS_flow_parquet)
  ),
  tar_target(
    release_by_ISTARF_method,
    RM3_ISTARF(target_plants_mapped_to_water,
               ISTARF_dir,
               ResOps_dir,
               daily_releases = Dayflow_hydro_dams_daily)
  ),
  tar_target(
    release_by_RoR_method,
    RM4_RoR(target_plants_mapped_to_water,
            daily_releases = Dayflow_hydro_dams_daily)
  ),
  tar_target(
    release_by_HUC4outlet_method,
    RM5_HUC4outlet(target_plants_mapped_to_water,
                   USGS_flow_HUC4outlet_parquet)
  ),
  # collect the release data from all methods; tracked as file(s) on disk
  tar_target(
    daily_release_files,
    prep_daily_release_files(release_by_ResOps_method,
                            release_by_USGS_method,
                            release_by_ISTARF_method,
                            release_by_RoR_method,
                            release_by_HUC4outlet_method),
    format = "file"
  ),
  # link daily release data from all methods to each dam
  # (function defined in "/R/assign daily release to each dam.R")
  tar_target(
    daily_release_selected,
    assign_daily_release(target_plants_mapped_to_water,
                         daily_release_files)
  ),
  # combine all sources of release data and write out to temporary parquets
  # (function defined in "/R/assign daily release to each dam.R")
  tar_target(
    daily_release_selected_by_dam,
    prep_selected_release_files(daily_release_selected),
    format = "file"
  ),
  # read observed generation data from CEC and map to EIA ID
  # NOTE(review): EIAPlantID is compared against the string "8902"; rows whose
  # EIAPlantID is NA are also dropped by this filter -- confirm that is intended.
  tar_target(
    QFER_gen,
    read_csv(QFER_generation_csv, show_col_types = FALSE) |>
      left_join(
        read_csv(QFER_mapping_csv, show_col_types = FALSE) |>
          select(CECPlantID, EIAPlantID, `Energy Source Category`) |>
          distinct(),
        by = "CECPlantID"
      ) |>
      filter(
        # filter for hydro plants
        `Energy Source Category` == "WATER",
        # remove Hoover (already observed monthly in EIA-923)
        EIAPlantID != "8902"
      )
  ),
  # convert EIA and CEC obs generation data to RHP format...
  # ...for use in spill quantile calibration
  # (function defined in "/R/spill quantile determination.R")
  tar_target(
    gen_1980_2019_monthly_obs_by_RHPID,
    aggregate_EIA_gen_to_RHPID(target_plants_mapped_to_water,
                              gen_1980_2022_monthly_with_freq,
                              QFER_gen)
  ),
  # optimize the spill quantile for each dam
  # (function defined in "/R/spill quantile determination.R")
  tar_target(
    spill_quantiles,
    get_spill_quantile(gen_1980_2019_monthly_obs_by_RHPID,
                       daily_release_selected_by_dam)
  ),
  # use spill quantile to determine daily powered release
  # (function defined in "/R/spill quantile determination.R")
  tar_target(
    daily_penstock_release,
    get_daily_penstock_release(spill_quantiles,
                               daily_release_selected_by_dam),
    format = "file"
  ),
  # get monthly fractions for downscaling
  # (function defined in "/R/release to generation.R")
  tar_target(
    monthly_flow_fractions,
    penstock_release_to_monthly_fractions(target_plants_mapped_to_water,
                                          daily_penstock_release)
  ),
  # define supplementary fractions for cases where release is missing
  # (function defined in "/R/release to generation.R")
  tar_target(
    monthly_supp_fractions,
    nearest_yr_gen_to_monthly_fractions(annual_energy_CF_water,
                                        gen_1980_2019_monthly_obs_by_RHPID)
  ),
  # perform downscaling from annual to monthly
  # (function defined in "/R/release to generation.R")
  tar_target(
    monthly_ActGen_CCfGen_ObsGen_1980_2019,
    disaggregate_gen(monthly_flow_fractions,
                     monthly_supp_fractions,
                     gen_1980_2019_monthly_obs_by_RHPID,
                     annual_energy_CF_water,
                     annual_CF_models)
  ),
  # create final data tables and write output
  # (function defined in "/R/release to generation.R")
  tar_target(
    RectifHydPlus,
    select_final_gen(target_plants_mapped_to_water,
                     annual_energy_CF_water,
                     monthly_ActGen_CCfGen_ObsGen_1980_2019,
                     QFER_gen)
  )
)