# Make national maps of data quality issues by CBSA, CSA, and 1990 commuting zone (CZ)

library(tidyverse)
library(sf)
library(tigris)
library(Hmisc)

# Create the output folder for the data-quality maps.
# recursive = TRUE also creates any missing parent folders, and
# showWarnings = FALSE keeps re-runs quiet when the folder already exists.
dir.create('output/nation_maps/dq_maps', recursive = TRUE, showWarnings = FALSE)

# County polygons for 2020 (cartographic boundary file, 20m resolution).
# shift_geometry() is applied before the filter; the filter then keeps only
# rows whose state FIPS code is below 60.
cntys = counties(cb = TRUE, resolution = '20m', year = 2020) %>%
  shift_geometry() %>%
  filter(as.numeric(STATEFP) < 60)


## 2020 CBSAs
# County-level crosswalk giving each county's CBSA and CSA codes.
cbsa = read.csv('nonproprietary_data/msa_crosswalks/list1_2020.csv')

# Clean up CBSA: zero-pad every code to a fixed width so it can be matched as
# text, flag micropolitan areas, and keep only the columns used further down.
cbsa = cbsa %>%
  mutate(
    st = str_pad(FIPS.State.Code, 2, pad = '0'),
    cnty = str_pad(FIPS.County.Code, 3, pad = '0'),
    cbsa = str_pad(CBSA.Code, 5, pad = '0'),
    csa = str_pad(CSA.Code, 3, pad = '0'),
    micro = Metropolitan.Micropolitan.Statistical.Area == 'Micropolitan Statistical Area'
  ) %>%
  select(st, cnty, cbsa, csa, micro)

# Bring in 1990 CZs
# County-to-commuting-zone crosswalk.
czs = read.csv('nonproprietary_data/msa_crosswalks/cty_cz_st_crosswalk.csv')

# Zero-pad the codes, then split the 5-character county FIPS so that the last
# three characters (the county part) can be joined together with the state code.
czs = czs %>%
  mutate(
    cz = str_pad(cz, 5, pad = '0'),
    st = str_pad(state_fips, 2, pad = '0'),
    fips = str_pad(cty, 5, pad = '0'),
    cnty = str_sub(fips, 3, 5)
  ) %>%
  select(st, cnty, cz)


## Fix missing counties - cases where the county name/FIPS in the 2020 Census
## differs from that in the 1990 CZ data - taking info from https://www.census.gov/programs-surveys/geography/technical-documentation/county-changes.1990.html#list-tab-957819518

# czcounty is the county code used to look the county up in the CZ crosswalk.
# It equals COUNTYFP unless one of the recodes below applies. The rules are
# mutually exclusive, so their order does not matter.
cntys = cntys %>%
  mutate(
    czcounty = case_when(
      # Broomfield County from Boulder County
      STATEFP == '08' & COUNTYFP == '014' ~ '013',
      # Miami-Dade County was just Dade County
      STATEFP == '12' & COUNTYFP == '086' ~ '025',
      # Oglala Lakota County from Shannon County
      STATEFP == '46' & COUNTYFP == '102' ~ '113',
      # Treat Kalawao County as part of Maui County since it is not in the CZ crosswalk
      STATEFP == '15' & COUNTYFP == '005' ~ '009',

      # Lots of Alaska changes, grouped by the code they map to
      STATEFP == '02' & COUNTYFP %in% c('063', '066', '158') ~ '261',
      STATEFP == '02' & COUNTYFP %in% c('105', '230', '282') ~ '231',
      STATEFP == '02' & COUNTYFP %in% c('195', '275') ~ '280',
      STATEFP == '02' & COUNTYFP == '068' ~ '290',
      STATEFP == '02' & COUNTYFP == '198' ~ '201',

      # Everything else keeps its current county code
      TRUE ~ COUNTYFP
    )
  )


# Attach the CBSA/CSA codes using the current county FIPS, and the 1990
# commuting zone using the recoded county FIPS (czcounty).
cntys = cntys %>%
  left_join(cbsa, by = c('STATEFP' = 'st', 'COUNTYFP' = 'cnty')) %>%
  left_join(czs, by = c('STATEFP' = 'st', 'czcounty' = 'cnty'))

# Counties with no CBSA (or CSA) code fall back to their state FIPS code, so
# every county still belongs to some unit when the shapes are aggregated.
cntys = cntys %>%
  mutate(
    cbsa = coalesce(cbsa, STATEFP),
    csa = coalesce(csa, STATEFP)
  )


# For every large geography (CBSA, CSA, 1990 CZ) and every small-unit
# definition: read the per-unit flag file, print diagnostics, aggregate the
# data-quality flags to the large geography, and write one PDF map per
# category column (pcat_*).
for (bigunit in c('cbsa', 'csa', 'cz')) {

  # Get aggregated shape: copy the current geography's code into `bigint` and
  # group the counties on it. `cnt` is the number of counties in each unit.
  cntys$bigint = cntys[[bigunit]]
  mapshp = cntys %>%
    group_by(bigint) %>%
    summarise(cnt = n())

  for (smallunit in c('loc2', 'loc', 'school')) {

    # Bring in FU data
    fudta = read_csv(paste0('intermediate_data/fu_prop/fu_', bigunit, '_', smallunit, '.csv'))

    # Zero-pad the join key to the same width used for the county codes above.
    if (bigunit == 'csa') fudta$bigint = str_pad(fudta$bigint, 3, pad = '0')
    if (bigunit %in% c('cz', 'cbsa')) fudta$bigint = str_pad(fudta$bigint, 5, pad = '0')

    # ---- Diagnostics -------------------------------------------------------
    # Values are not auto-printed inside a for loop, so every diagnostic is
    # printed explicitly; without print() these pipelines have no visible effect.

    # Population flag
    fudta %>% filter(fl_pop40 == TRUE) %>% arrange(-cenpop) %>% select(st, fuint, fuintname, clpop, cenpop) %>% print()
    fudta %>% filter(fl_pop40 == TRUE) %>% count(st) %>% arrange(-n) %>% print()
    fudta %>% group_by(st, fl_pop40) %>% summarise(cnt = n(), pop = sum(cenpop)) %>% print(n = 120)
    fudta %>% group_by(fl_pop40) %>% summarise(cnt = n(), pop = sum(cenpop)) %>% print(n = 120)
    fudta %>% filter(st == '46') %>% summarise(cnt = n(), pop = sum(cenpop)) %>% print(n = 120)
    fudta %>% filter(st == '46', fl_pop40 == TRUE) %>% summarise(cnt = n(), pop = sum(cenpop)) %>% print(n = 120)

    # fl_mv40_tot flag, among units not flagged on population
    fudta %>% filter(fl_pop40 == FALSE) %>% count(fl_mv40_tot) %>% print()
    fudta %>% filter(fl_pop40 == FALSE, fl_mv40_tot == TRUE) %>% select(st, fuint, fuintname, clpop, cenpop) %>% arrange(-cenpop) %>% print()
    fudta %>% filter(fl_pop40 == FALSE) %>% group_by(fl_mv40_tot) %>% summarise(cnt = n(), pop = sum(cenpop)) %>% print(n = 120)

    # Val 2k
    fudta %>% filter(fl_pop40 == FALSE, fl_mv40_tot == FALSE) %>% count(fl_val2k_tot) %>% print()
    fudta %>% filter(fl_pop40 == FALSE, fl_mv40_tot == FALSE, fl_val2k_tot == TRUE) %>% select(st, fuint, fuintname, clpop, cenpop) %>% arrange(-cenpop) %>% print()
    fudta %>% filter(fl_pop40 == FALSE, fl_mv40_tot == FALSE, fl_val2k_tot == FALSE) %>% select(pc_value_tot) %>% summary() %>% print()
    fudta %>% filter(fl_pop40 == FALSE, fl_mv40_tot == FALSE) %>% group_by(fl_val2k_tot) %>% summarise(cnt = n(), pop = sum(cenpop)) %>% print(n = 120)

    # Tax rate
    fudta %>% filter(fl_pop40 == FALSE, fl_mv40_tot == FALSE, fl_val2k_tot == FALSE) %>% count(fl_tr15_tot, fl_taxgt10k_tot) %>% print()
    fudta %>% filter(fl_pop40 == FALSE, fl_mv40_tot == FALSE, fl_val2k_tot == FALSE, fl_tr15_tot == TRUE, fl_taxgt10k_tot == FALSE) %>% select(st, fuint, fuintname, clpop, cenpop, tax_tot) %>% arrange(-cenpop) %>% print()

    fudta %>% group_by(cleansample_val_tot) %>% summarise(cnt = n(), pop = sum(cenpop)) %>% print(n = 120)

    # ---- Aggregate FUs to the large unit ------------------------------------
    # ppc_*  : mean of each fl_* flag weighted by clpop
    # cppc_* : mean of fl_pop40 and each cleansample_* column weighted by cenpop
    # pcat_* : the shares above cut into labelled categories for mapping
    # In every case_when() below an NA comparison simply fails to match, so NA
    # shares are caught only by the explicit is.na() test in the first rule.
    mfudta = fudta %>%
      group_by(bigint) %>%
      summarise(
        across(starts_with('fl_'), ~ wtd.mean(.x, clpop), .names = 'ppc_{.col}'),
        across(c(fl_pop40, starts_with('cleansample_')), ~ wtd.mean(.x, cenpop), .names = 'cppc_{.col}'),
        across(
          c(starts_with('cppc_'), starts_with('ppc_')),
          ~ case_when(
            is.na(.x) | .x == 0 ~ '0% or missing',
            .x > 0 & .x <= .1 ~ '1-10%',
            .x > .1 & .x <= .25 ~ '10-25%',
            .x > .25 & .x < 1 ~ '25%-99%',
            .x == 1 ~ 'Full 100%'
          ),
          .names = 'pcat_{.col}'
        ),

        # Share of cenpop in the clean sample, in coarser categories.
        # The 0.6-0.9 category is labelled '60-90%' (it used to read '60-95%',
        # which contradicted its own cut points and the '90-95%' category).
        pcat_cleansamp = case_when(
          is.na(cppc_cleansample_val_tot) | cppc_cleansample_val_tot == 0 ~ '0% or missing',
          cppc_cleansample_val_tot > 0 & cppc_cleansample_val_tot <= .6 ~ '1-60%',
          cppc_cleansample_val_tot > .6 & cppc_cleansample_val_tot <= .9 ~ '60-90%',
          cppc_cleansample_val_tot > .9 & cppc_cleansample_val_tot <= .95 ~ '90-95%',
          cppc_cleansample_val_tot > .95 & cppc_cleansample_val_tot < 1 ~ '95%-99%',
          cppc_cleansample_val_tot == 1 ~ 'Full 100%'
        ),

        # clpop summed over clean-sample units, relative to total cenpop.
        popcl = sum(clpop * (cleansample_val_tot == TRUE), na.rm = TRUE),
        popcen = sum(cenpop),
        poppct = popcl / popcen,

        pcat_popcen = case_when(
          is.na(poppct) | poppct == 0 ~ '0% or missing',
          poppct > 0 & poppct <= .6 ~ '1-60%',
          poppct > .6 & poppct <= .9 ~ '60-90%',
          poppct > .9 & poppct <= .95 ~ '90-95%',
          poppct > .95 ~ 'Over 95%'
        )
      ) %>%
      ungroup()

    pcats = mfudta %>% select(starts_with('pcat_')) %>% names()

    # Units with no row in mfudta get NA categories after the join.
    mapd = left_join(mapshp, mfudta, by = 'bigint')

    # Make maps of each category
    for (pcat in pcats) {

      mapd$outint = mapd[[pcat]]
      map_file = paste0('output/nation_maps/dq_maps/map_dq_', bigunit, '_', smallunit, '_', pcat, '.pdf')

      # pcat_popcen is the overall map used in the appendix and gets its own
      # legend (reversed palette, explicit labels, grey for NA). Every other
      # category uses the generic legend. Each map is drawn once; previously
      # the pcat_popcen file was written with the generic legend and then
      # immediately overwritten.
      if (pcat == 'pcat_popcen') {
        fill_scale = scale_fill_brewer(
          name = paste('Fraction of', str_to_upper(bigunit), 'population\npresent in final data set'),
          palette = 'YlGnBu', direction = -1, na.value = 'grey90',
          labels = c('0%', '1-60%', '60-90%', '90-95%', '>95%', paste0('Non-', str_to_upper(bigunit)))
        )
      } else {
        fill_scale = scale_fill_brewer(name = 'DQ flag', palette = 'YlGnBu', direction = 1)
      }

      gplt = ggplot(mapd, aes(fill = outint)) +
        geom_sf(size = .1, color = 'black') +
        fill_scale +
        theme(
          panel.background = element_blank(),
          panel.grid = element_blank(),
          axis.ticks = element_blank(),
          axis.text = element_blank(),
          plot.title = element_text(hjust = .5)
        )

      pdf(map_file, width = 12, height = 6)
      print(gplt)
      dev.off()

      # The CBSA / loc2 version of the overall map is also saved as the
      # appendix figure.
      # NOTE(review): ggsave() is called without width/height, so the figure
      # size is not pinned to the 12 x 6 used for the other maps - confirm
      # this is intended.
      if (pcat == 'pcat_popcen' && bigunit == 'cbsa' && smallunit == 'loc2') {
        ggsave('output/nation_maps/fig_a11_map_data_quality.pdf', plot = gplt)
      }
    }

  }
}
