library(dataRetrieval)
library(dplyr)
library(lubridate)
library(readr)
library(purrr)
library(arrow)
library(stringr)
library(tidyr)

# Read the HILARRI change table. The gage ID column is forced to character so
# that IDs stored as bare numbers are never parsed as doubles: as.character()
# on a double can render round values in scientific notation (12000000 becomes
# "1.2e+07"), which would then be padded into an invalid site number.
hil_changes <- read_csv(
  "data/misc/HILARRI_changes.csv",
  col_types = cols(usgs_gage_new = col_character()),
  show_col_types = FALSE
)

# ---- Unique list of gage IDs, zero-padded to at least 8 characters ----
# Strip an optional "USGS-" prefix, then left-pad short IDs with zeros.
# str_pad() only pads, so IDs already longer than 8 characters are kept as-is.
gages_to_pull <- hil_changes %>%
  filter(!is.na(usgs_gage_new)) %>%
  transmute(
    usgs_gage = str_pad(
      str_remove(as.character(usgs_gage_new), "^USGS-"),
      width = 8, side = "left", pad = "0"
    )
  ) %>%
  distinct(usgs_gage) %>%
  pull(usgs_gage)

# ---- Target date range (1980-01-01 to 2019-12-31) ----
# One-column tibble holding every calendar day of the study window, both end
# dates included. Gage series are joined against it to get a gap-free record.
date_seq <- tibble(
  date = seq(as.Date("1980-01-01"), as.Date("2019-12-31"), by = "day")
)

# ---- Fetch one gage safely; ensure unique per date within series ----
# Download the daily-value record (parameter code 00060) for a single gage and
# lay it onto the calendar held in `date_seq`.
#
# Args:
#   x: Gage ID as a single character string of 8 or more characters.
#
# Returns:
#   A tibble with columns usgs_gage, date, flow_cfs holding one row per date in
#   `date_seq`; flow_cfs is NA on days without a reported value. An empty
#   tibble() is returned when the ID is shorter than 8 characters, the service
#   returns no rows, the expected value column is missing, or the request
#   raises an error. Errors are reported as warnings (with the gage ID and the
#   original message) so a failed request can be told apart from a gage that
#   simply has no data; they never abort the surrounding loop.
pull_one <- function(x) {
  tryCatch(
    {
      message(x)
      if (nchar(x) < 8) return(tibble())

      # No date window is passed to the service; the record is trimmed locally.
      raw <- readNWISdv(x, parameterCd = "00060")
      if (nrow(raw) == 0) return(tibble())

      # Only the plain statistic-00003 column is used; gages whose value column
      # carries any other name are treated as having no data.
      col_q <- "X_00060_00003"
      if (!col_q %in% names(raw)) return(tibble())

      as_tibble(raw) %>%
        transmute(date = Date, flow_cfs = as.numeric(.data[[col_q]])) %>%
        filter(date >= ymd("1980-01-01"), date <= ymd("2019-12-31")) %>%
        # if NWIS ever returns same date multiple times, keep first non-NA
        arrange(date) %>%
        group_by(date) %>%
        summarise(
          flow_cfs = if (any(!is.na(flow_cfs))) {
            first(flow_cfs[!is.na(flow_cfs)])
          } else {
            NA_real_
          },
          .groups = "drop"
        ) %>%
        # keep every calendar day, filling days without a value with NA
        right_join(date_seq, by = "date") %>%
        mutate(usgs_gage = x, .before = 1)
    },
    error = function(e) {
      warning(
        "Gage ", x, ": pull failed: ", conditionMessage(e),
        call. = FALSE, immediate. = TRUE
      )
      tibble()
    }
  )
}

# ---- Pull all gages ----
# One list element per entry of gages_to_pull, in the same order; gages that
# produced nothing contribute an empty tibble.
all_data_list <- map(gages_to_pull, pull_one)

# ---- Stack & de-dup new data on (usgs_gage, date) ----
# A zero-row, fully typed frame leads the stack so that new_data always has the
# three expected columns. Without it, a run in which every pull came back as an
# empty tibble (or with no gages at all) stacks to a column-less tibble and the
# mutate() below fails on the missing usgs_gage column.
new_data <- bind_rows(
  tibble(
    usgs_gage = character(),
    date = as.Date(character()),
    flow_cfs = numeric()
  ),
  all_data_list
) %>%
  mutate(usgs_gage = as.character(usgs_gage)) %>%
  # Sorting on is.na(flow_cfs) puts reported values ahead of missing ones
  # within each (gage, date) pair, so keeping the first row per pair gives a
  # reported flow when one exists and NA otherwise. This is done in one
  # vectorised pass instead of evaluating an expression once per pair.
  arrange(usgs_gage, date, is.na(flow_cfs)) %>%
  distinct(usgs_gage, date, .keep_all = TRUE)

# (Optional) write the "new gages" parquet
write_parquet(new_data, "data/USGS/p00060_dv_1980_2019_new_gages.parquet")

# ---- Read existing (if present) and de-dup it too ----
# When no archive exists yet, start from an empty frame with the same three
# columns so the later combine step works unchanged.
existing_path <- "data/USGS/p00060_dv_1980_2019.parquet"
old_data <- if (file.exists(existing_path)) {
  read_parquet(existing_path)
} else {
  tibble(
    usgs_gage = character(),
    date = as.Date(character()),
    flow_cfs = numeric()
  )
}

# Reduce the archive to one row per (usgs_gage, date). Only the three working
# columns are kept. Sorting on is.na(flow_cfs) puts reported values ahead of
# missing ones within each pair, so keeping the first row per pair gives a
# reported flow when one exists and NA otherwise. This runs as one vectorised
# pass rather than evaluating an expression once per (gage, date) pair, which
# matters because the archive holds every gage-day written so far.
old_data <- old_data %>%
  select(usgs_gage, date, flow_cfs) %>%
  mutate(
    usgs_gage = as.character(usgs_gage),
    flow_cfs = as.numeric(flow_cfs)
  ) %>%
  arrange(usgs_gage, date, is.na(flow_cfs)) %>%
  distinct(usgs_gage, date, .keep_all = TRUE)

# ---- Combine with precedence to NEW data; guarantee uniqueness ----
# Old rows are kept only for (gage, date) pairs that new_data does not contain;
# every row of new_data is then appended and the result is sorted.
final_data <- local({
  fresh_keys <- select(new_data, usgs_gage, date)
  # drop old rows that new replaces
  retained_old <- anti_join(old_data, fresh_keys, by = c("usgs_gage", "date"))
  arrange(bind_rows(retained_old, new_data), usgs_gage, date)
})

# ---- Write final parquet (no duplicate (gage,date) rows) ----
# Overwrites the archive that was read earlier.
write_parquet(final_data, existing_path)

# ---- Report skipped gages (no data returned) ----
# A gage is listed when its entry in all_data_list has zero rows; the list is
# in the same order as gages_to_pull, so the mask indexes it directly.
skipped <- gages_to_pull[vapply(all_data_list, nrow, integer(1)) == 0L]
print(skipped)
