# Make percentiles of property value for each CBSA

library(tidyverse); options(tibble.width = Inf)

# Create the output directories for this script.
# `recursive = TRUE` creates any missing parent directory as well, and
# `showWarnings = FALSE` keeps re-runs quiet when a directory already exists.
for (outdir in c('intermediate_data/cbsa_properties',
                 'intermediate_data/cbsa_properties/tractdta',
                 'intermediate_data/cbsa_properties/pctiles')) {
  dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
}

# Read the CBSA delineation crosswalk and build one row per county with
# zero-padded character codes:
#   st    - 2-character state FIPS code
#   cnty  - 3-character county FIPS code
#   cbsa  - 5-character CBSA code
#   csa   - 3-character CSA code
#   micro - TRUE when the area is labelled 'Micropolitan Statistical Area'
# Rows whose CBSA code is not numeric or is '00000' are dropped, as is
# state code '72' (Puerto Rico under the standard FIPS state codes).
msas <- read.csv('nonproprietary_data/msa_crosswalks/list1_2020.csv') %>%
  mutate(
    st = str_pad(FIPS.State.Code, 2, pad = '0'),
    cnty = str_pad(FIPS.County.Code, 3, pad = '0'),
    cbsa = str_pad(CBSA.Code, 5, pad = '0'),
    csa = str_pad(CSA.Code, 3, pad = '0'),
    micro = Metropolitan.Micropolitan.Statistical.Area == 'Micropolitan Statistical Area'
  ) %>%
  select(st, cnty, cbsa, csa, micro) %>%
  filter(!is.na(as.numeric(cbsa)), cbsa != '00000', st != '72')

# Distinct CBSA codes to process
cbsas <- unique(msas$cbsa)

# Loop through CBSAs. For each CBSA:
#   1. stack the non-exempt properties of every county in the CBSA,
#   2. compute percentiles of `val_clean` (all properties, and residential only),
#   3. assign each property to a percentile bin,
#   4. save the property-level data and tract-level counts by percentile bin.

# CBSAs that reach the end of an iteration (i.e. had at least one property
# with a non-missing value)
cbsdone = NULL

# Iterate over the vector directly so that an empty `cbsas` runs zero
# iterations (`cbsas[1:length(cbsas)]` would produce a single NA instead).
for(cbint in cbsas){
  
  cbdta = filter(msas, cbsa == cbint)
  states = unique(cbdta$st)
  
  propdta = NULL
  
  # Aggregate all the relevant counties across states
  for(stint in states){
    
    # Bring in clean property data (the .Rdata file is expected to define `stprop`)
    load(paste0('intermediate_data/properties/properties_',stint,'.Rdata'))
    
    # Drop exempt properties and split GEOID into state / county / tract codes
    stprop = stprop %>% filter(exempt == FALSE) %>% mutate(
                    st = substr(GEOID,1,2),
                    cnty = substr(GEOID,3,5),
                    tract = substr(GEOID,6,11))
    
    # Keep only the properties whose county belongs to this CBSA
    stprop = left_join(stprop, cbdta, by = c('st','cnty'))
    stprop = filter(stprop, !is.na(cbsa))
    
    propdta = bind_rows(propdta, stprop)
    
  }
  
  
  # Remove missing value data
  propdta = filter(propdta, !is.na(val_clean))

  # Nothing to do for a CBSA without any valued property
  if(nrow(propdta) == 0){next}
  
  # Now compute percentiles of total value 
  valpcts = quantile(propdta$val_clean,probs = 1:100/100)
  
  resprops = propdta %>% filter(res == TRUE)
  valrespcts = quantile(resprops$val_clean,probs = 1:100/100)
  
  # Get unique cuts: when several percentiles share the same value they are
  # collapsed into one row labelled with the lowest of those percentiles;
  # `size` is the number of percentiles that share the value
  valpctsclean = cbind(pct = 1:100,val = valpcts) %>% data.frame() %>%
    group_by(val) %>% summarise(pct = round(min(pct)),size = length(val))
  valrespctsclean = cbind(pct = 1:100,val = valrespcts) %>% data.frame() %>%
    group_by(val) %>% summarise(pct = round(min(pct)),size = length(val))
  
  # With no residential properties every residential quantile is NA; use a
  # placeholder cut point so the breaks passed to cut() below are valid
  if(nrow(resprops) == 0) valrespctsclean[1,'val'] = 1
  
  valpctsclean$cbsa = cbint
  valrespctsclean$cbsa = cbint
  
  # Save the cut points. The residential file must receive the residential
  # table (it previously received a copy of the all-property table).
  write_csv(valpctsclean,paste0('intermediate_data/cbsa_properties/pctiles/pcts_all_',cbint,'.csv'))
  write_csv(valrespctsclean,paste0('intermediate_data/cbsa_properties/pctiles/pcts_res_',cbint,'.csv'))

  # Now assign properties to percentiles.
  # -1000000000 is the lower edge of the first bin; a value at or below it
  # would get NA and trip the checks that follow. cut() uses right-closed
  # intervals by default, so a value equal to a cut point falls in that
  # cut point's bin.
  propdta = mutate(propdta, 
                   pct_val = cut(val_clean,breaks = c(-1000000000, valpctsclean$val),labels = valpctsclean$pct),
                   pct_resval = case_when(res == TRUE ~ cut(val_clean, breaks = c(-1000000000,valrespctsclean$val),labels = valrespctsclean$pct),
                                          res == FALSE ~ NA)
  )
  
  # Every property needs an overall bin; every residential property needs a
  # residential bin
  stopifnot(!is.na(propdta$pct_val))
  stopifnot(!is.na(propdta$pct_resval) | propdta$res == FALSE)
  
  # Save clean properties by CBSA
  save(propdta, file =  paste0('intermediate_data/cbsa_properties/cbsa_prop_',cbint,'.Rdata'))
  
  # Now aggregate to tract for segregation measurement 
  tractdta_all = propdta %>% group_by(st,cnty,tract,pct_val) %>% summarise(
    cnt = length(CLIP)
  )
  
  # Counts must add back up to the number of properties
  stopifnot(sum(tractdta_all$cnt) == nrow(propdta))
  
  write_csv(tractdta_all,paste0('intermediate_data/cbsa_properties/tractdta/all_tract_',cbint,'.csv'))
  
  # Now do residential tract counts.
  # Non-residential properties have pct_resval = NA, so they form NA groups
  # with cnt = 0; those rows are kept in the output file.
  tractdta_res = propdta %>% group_by(st,cnty,tract,pct_resval) %>% summarise(
    cnt = sum(res)
  )
  stopifnot(sum(tractdta_res$cnt) == nrow(filter(propdta, res == TRUE)))
  
  write_csv(tractdta_res,paste0('intermediate_data/cbsa_properties/tractdta/res_tract_',cbint,'.csv'))
  
  # Record this CBSA as completed
  cbsdone = rbind(cbsdone,cbint)
  
}

# Write out the list of CBSAs that were fully processed, i.e. those that
# reached the end of a loop iteration rather than being skipped.
# NOTE(review): data.frame() derives the column name from the argument
# expression (expected to be `cbsdone`) - keep this call form if any
# downstream code reads that CSV header.
cbsdone = data.frame(cbsdone)
write_csv(cbsdone, 'intermediate_data/cbsa_properties/cbs_done.csv')
