# Calculate tax base fragmentation quotient by CBSA, CZ, and CSA

library(tidyverse)


# Create the output directories up front. `recursive = TRUE` also builds a
# missing parent (e.g. 'output'), and `showWarnings = FALSE` keeps re-runs
# quiet when the directories already exist.
for (out_dir in c("output/tfq", "output/tfq/tfq_fu", "clean_data")) {
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
}


# Build one name lookup table per big unit. Each table has a character
# `bigint` code (left-padded with zeros) and a `bigname`; the CBSA table also
# carries a `bigshortname`.

# CBSAs: 5-character codes.
msanames <- read_csv("nonproprietary_data/msa_crosswalks/msa_names_2020.csv") %>%
  mutate(
    bigint = str_pad(CBSA.Code, 5, pad = "0"),
    bigname = CBSA.Title,
    bigshortname = shortname
  ) %>%
  select(bigint, bigname, bigshortname)

# CSAs: 3-character codes.
csanames <- read_csv("nonproprietary_data/msa_crosswalks/csa_names_2020.csv") %>%
  mutate(
    bigint = str_pad(csa, 3, pad = "0"),
    bigname = csaname
  ) %>%
  select(bigint, bigname)

# Commuting zones: 5-character codes.
cznames <- read_csv("nonproprietary_data/msa_crosswalks/cznames_chetty2014.csv") %>%
  mutate(
    bigint = str_pad(CZ, 5, pad = "0"),
    bigname = `CZ Name`
  ) %>%
  select(bigint, bigname)

# Counties: code is 'nc' followed by the 5-character FIPS code.
cntynames <- read_csv("nonproprietary_data/msa_crosswalks/US_FIPS_CountyCodes.csv") %>%
  mutate(
    bigint = paste0("nc", str_pad(`FIPS Code`, 5, pad = "0")),
    bigname = paste0(`County Name`, " County, ", State),
    bigshortname = `County Name`
  ) %>%
  select(bigint, bigname, bigshortname)

# The CBSA lookup also includes the county entries.
msanames <- bind_rows(msanames, cntynames)

# Compute the tax base fragmentation quotient (TFQ) for every pairing of a big
# unit (cbsa, cz, csa) with a small fiscal-unit type, once per property class
# (tot, res, ag, oth), then merge the per-class results into one table per
# pairing and export it.

# Zero-padded width of the `bigint` code for each big unit.
bigint_width <- c(cbsa = 5, cz = 5, csa = 3)

# Name lookup table joined onto the results for each big unit.
bigunit_names <- list(cbsa = msanames, cz = cznames, csa = csanames)

# File-name label used in the clean_data export for each small unit.
smallunit_labels <- c(
  loc2 = "gen_purpose_govts",
  loc = "gen_purpose_govts_incl_townships",
  stcnty = "counties",
  school = "school_districts"
)

# Read one raw per-class TFQ file back in, dropping the intermediate sums.
read_raw_tfq <- function(bigunit, smallunit, sint) {
  read_csv(paste0("output/tfq/raw_tfq_", bigunit, "_", smallunit, "_", sint, ".csv")) %>%
    select(-val_tomove, -val_tot)
}

for (bigunit in c("cbsa", "cz", "csa")) {
  for (smallunit in c("loc2", "loc", "school", "stcnty")) {
    fudta <- read_csv(paste0("intermediate_data/fu_prop/fu_", bigunit, "_", smallunit, ".csv"))
    fudta$nprop_nonres <- fudta$nprop_tot - fudta$nprop_res

    # Codes may have been parsed as numbers; restore the zero-padded strings.
    fudta$bigint <- str_pad(fudta$bigint, bigint_width[[bigunit]], pad = "0")

    # Do separately for each property class
    for (sint in c("tot", "res", "ag", "oth")) {
      # tot/res have their own clean-sample flag; the other classes require
      # both the residential and the total flag.
      if (sint %in% c("tot", "res")) {
        fudta <- mutate(fudta, cleanflag = .data[[paste0("cleansample_val_", sint)]])
      } else {
        fudta <- mutate(
          fudta,
          cleanflag = cleansample_val_res == TRUE & cleansample_val_tot == TRUE
        )
      }

      # Copy this class's columns into generic names. `.data[[...]]` errors if
      # a column is missing, so values from a previous class are never reused.
      fudta <- mutate(
        fudta,
        value = .data[[paste0("value_", sint)]],
        valpc = .data[[paste0("pc_value_", sint)]],
        nprop = .data[[paste0("nprop_", sint)]],
        taxrt = .data[[paste0("fu_taxrt_", sint)]]
      )

      # Limit to clean sample
      cldta <- filter(fudta, cleanflag == TRUE)
      stopifnot(cldta$clpop > 0) # All FUs have at least 1 resident

      # Big-unit totals: value, population, number of fiscal units, and the
      # derived value per capita and fiscal units per 100k residents.
      bigudta <- cldta %>%
        group_by(bigint) %>%
        summarise(
          totval = sum(value),
          totpop_cl = sum(clpop),
          ngovts = n(),
          totpop_cen = sum(cenpop)
        ) %>%
        mutate(
          bigpc = totval / totpop_cl,
          govts100k = ngovts / totpop_cl * 100000
        )

      cldta <- left_join(cldta, bigudta, by = "bigint")
      write_csv(
        cldta,
        paste0("output/tfq/tfq_fu/fu_big_", bigunit, "_", smallunit, "_", sint, ".csv")
      )

      # Population-weighted absolute gap between each fiscal unit's value per
      # capita and its big unit's value per capita.
      cldta <- mutate(
        cldta,
        dif_valpc = valpc - bigpc,
        absdif_valpc = abs(dif_valpc),
        tosum_val = absdif_valpc * clpop
      )

      # Sum and export TFQ. bigpc, ngovts and govts100k are constant within a
      # big unit after the join above, so mean() just recovers that value.
      tfqdta <- cldta %>%
        group_by(bigint) %>%
        summarise(
          pop = sum(clpop),
          val_tomove = sum(tosum_val),
          val_tot = sum(value),
          val_pc = mean(bigpc),
          nprop = sum(nprop),
          ngovts = mean(ngovts),
          govts100k = mean(govts100k)
        ) %>%
        mutate(tfq_val = val_tomove / val_tot / 2)

      # Now that pop zero are dropped, only places missing have zero assessed value
      stopifnot(!is.na(tfqdta$tfq_val) | tfqdta$val_tot == 0)

      write_csv(
        tfqdta,
        paste0("output/tfq/raw_tfq_", bigunit, "_", smallunit, "_", sint, ".csv")
      )
    }

    # Re-combine into one clean dataset
    tfqtot <- read_raw_tfq(bigunit, smallunit, "tot")
    tfqres <- read_raw_tfq(bigunit, smallunit, "res")
    tfqag <- read_raw_tfq(bigunit, smallunit, "ag")
    tfqoth <- read_raw_tfq(bigunit, smallunit, "oth")

    # The 'tot' columns deliberately stay unsuffixed through the first two
    # joins so that they still collide in the last one, where they receive
    # '_tot' and the 'oth' columns receive '_oth'.
    tfqclean <- full_join(tfqtot, tfqres, by = "bigint", suffix = c("", "_res"))
    tfqclean <- full_join(tfqclean, tfqag, by = "bigint", suffix = c("", "_ag"))
    tfqclean <- full_join(tfqclean, tfqoth, by = "bigint", suffix = c("_tot", "_oth"))

    # Get names. The codes are re-padded because reading the raw files back
    # may have parsed them as numbers again.
    tfqclean$bigint <- str_pad(tfqclean$bigint, bigint_width[[bigunit]], pad = "0")
    tfqclean <- left_join(tfqclean, bigunit_names[[bigunit]], by = "bigint")

    # Deming NM is MSA missing tot, which makes sense bc it was missing tons of
    # non residential properties. A bare expression is not auto-printed inside
    # a loop, so report such units explicitly.
    missing_tot <- filter(tfqclean, is.na(pop_tot))
    if (nrow(missing_tot) > 0) {
      message(
        "Units with no 'tot' result for ", bigunit, "/", smallunit, ": ",
        paste(missing_tot$bigint, collapse = ", ")
      )
    }

    # Put identifier columns first; only the CBSA lookup has a short name.
    if (bigunit == "cbsa") {
      tfqclean <- tfqclean %>% select(bigint, bigname, bigshortname, everything())
    } else {
      tfqclean <- tfqclean %>% select(bigint, bigname, everything())
    }

    # Export
    write_csv(tfqclean, paste0("output/tfq/tfq_", bigunit, "_", smallunit, ".csv"))

    # Trimmed, renamed table for clean_data.
    cleandta <- tfqclean %>%
      arrange(bigint) %>%
      select(
        bigint,
        bigname,
        pop = pop_tot,
        nprop_all = nprop_tot,
        nprop_residential = nprop_res,
        value_pc_all_properties = val_pc_tot,
        value_pc_residential = val_pc_res,
        tfq_all_properties = tfq_val_tot,
        tfq_residential = tfq_val_res
      )
    names(cleandta)[1] <- paste0(bigunit, "_code")
    names(cleandta)[2] <- paste0(bigunit, "_name")

    smallname <- smallunit_labels[[smallunit]]

    write_csv(cleandta, paste0("clean_data/metro_data_", bigunit, "_", smallname, ".csv"))
  }
}
