# Match block data with property information to fiscal units
# As described in Supplementary Appendix 1, keep only fiscal units where:
  # At least 60% of Census pop present in blocks with property information
  # At least 60% of properties have value observed
  # At least $2000 of property value per person
  # FU-wide tax rate < 15% or at least $10k property value per person

# For each FU and county x property type, get % of properties and % of blocks with over 30% of properties
    # Missing value
    # Missing taxes
    # Missing tax rate
    # With tax rate >30%

# Then get % of blocks and % of population on blocks not in CL data

# Starting from property blocks
# % houses/ag/other/tot missing value
# % "" missing taxes
# % "" missing tax rate
# % "" with tax rate over 30%
# Do averages and fraction with > 

# Then add in Census block data and get
# % of Census blocks 

library(tidyverse); options(tibble.width = Inf)


# Make sure the output folder for the FU-level summaries exists.
# recursive = TRUE also creates any missing parent folder; showWarnings = FALSE
# keeps re-runs quiet when the folder is already there.
dir.create('intermediate_data/fu_prop', showWarnings = FALSE, recursive = TRUE)

# Block-level property data
allblk <- read_csv('intermediate_data/state_blocks/st_block_all.csv')

# Block-to-FU crosswalk with total block population
# (the .Rdata file is expected to provide `exp_block`, which is used right below)
load('intermediate_data/fiscal_units/fu_block_crosswalk.Rdata')

# Add a state-county id / name / type so counties can be looped over like any
# other fiscal unit type, then drop Puerto Rico (state code 72)
exp_block <- exp_block %>%
  mutate(
    stcnty = paste0(st, cnty),
    stcnty_name = paste0(COUNTY, ', ', STATE),
    stcnty_type = 'County'
  ) %>%
  filter(st != '72')

# Attach the crosswalk to the property blocks; crosswalk columns whose names clash
# with property-block columns get the '_rawcen' suffix
fullblk <- allblk %>%
  left_join(exp_block, by = c('GEOID' = 'fips'), suffix = c('', '_rawcen'))

save(fullblk, file = 'intermediate_data/state_blocks/st_block_joined.Rdata')


# Sanity checks (values recorded from a previous run)
nrow(exp_block) # 8.2 million

sum(is.na(exp_block$pop)) # 0
sum(is.na(fullblk$pop)) # 0

sum(fullblk$pop) #318927059
sum(exp_block$pop) #334735155

# Block-level helper columns for the data quality checks
fullblk <- mutate(
  fullblk,

  # Properties with a tax rate: all properties minus those missing a tax rate
  hastax_tot = nprop_tot - mistaxrt_tot,
  hastax_res = nprop_res - mistaxrt_res,
  hastax_ag  = nprop_ag - mistaxrt_ag,
  hastax_oth = nprop_oth - mistaxrt_oth,

  # Block-wide tax rate: total taxes over total value, by property type
  blktx_tot = tax_tot / value_tot,
  blktx_res = tax_res / value_res,
  blktx_ag  = tax_ag / value_ag,
  blktx_oth = tax_oth / value_oth
)


# Standardised name tables for the big units. Each table ends up with a
# zero-padded character id (`bigint`) and a display name (`bigname`); the
# CBSA and county tables also carry a short name (`bigshortname`).

# CBSAs
msanames <- read_csv('nonproprietary_data/msa_crosswalks/msa_names_2020.csv') %>%
  transmute(
    bigint = str_pad(CBSA.Code, 5, pad = '0'),
    bigname = CBSA.Title,
    bigshortname = shortname
  )

# CSAs
csanames <- read_csv('nonproprietary_data/msa_crosswalks/csa_names_2020.csv') %>%
  transmute(
    bigint = str_pad(csa, 3, pad = '0'),
    bigname = csaname
  )

# Commuting zones
cznames <- read_csv('nonproprietary_data/msa_crosswalks/cznames_chetty2014.csv') %>%
  transmute(
    bigint = str_pad(CZ, 5, pad = '0'),
    bigname = `CZ Name`
  )

# Counties, keyed by 'nc' + 5-digit FIPS code
cntynames <- read_csv('nonproprietary_data/msa_crosswalks/US_FIPS_CountyCodes.csv') %>%
  transmute(
    bigint = paste0('nc', str_pad(`FIPS Code`, 5, pad = '0')),
    bigname = paste0(`County Name`, ' County, ', State),
    bigshortname = `County Name`
  )

# County names are stacked under the CBSA names so both can be looked up
# through the same table
msanames <- bind_rows(msanames, cntynames)



# For every combination of big unit (cbsa / csa / cz) and fiscal unit type
# (loc2 / loc / school / stcnty), collapse the block data to one row per
# state x big unit x fiscal unit, compute data quality measures and sample
# flags, attach big-unit names, and write
# intermediate_data/fu_prop/fu_<bigunit>_<futype>.csv

# Name table and zero-padded id width for each type of big unit
bigunit_lookup <- list(
  cbsa = list(names_tbl = msanames, width = 5),
  csa = list(names_tbl = csanames, width = 3),
  cz = list(names_tbl = cznames, width = 5)
)

for (bigunit in names(bigunit_lookup)) {

  # Fail early with a clear message if the big-unit id column is missing
  stopifnot(bigunit %in% names(fullblk), bigunit %in% names(exp_block))

  # Copy the id of the current big unit into a common column name.
  # `[[` returns the plain column; select() %>% unlist() would attach a name
  # to every one of the millions of elements.
  fullblk$bigint <- fullblk[[bigunit]]
  exp_block$bigint <- exp_block[[bigunit]]

  for (futype in c('loc2', 'loc', 'school', 'stcnty')) {

    fu_name_col <- paste0(futype, '_name')
    fu_type_col <- paste0(futype, '_type')
    stopifnot(
      all(c(futype, fu_name_col, fu_type_col) %in% names(fullblk)),
      all(c(futype, fu_name_col) %in% names(exp_block))
    )

    # Copy id / name / type of the current fiscal unit into common column names.
    # The `futype` column shares its name with the loop variable: inside dplyr
    # verbs below `futype` is the column, everywhere else it is the loop string.
    fullblk$fuint <- fullblk[[futype]]
    fullblk$fuintname <- fullblk[[fu_name_col]]
    fullblk$futype <- fullblk[[fu_type_col]]

    exp_block$fuint <- exp_block[[futype]]
    exp_block$fuintname <- exp_block[[fu_name_col]]

    # Census totals from the full crosswalk (all blocks, with or without
    # property data)
    cen_collapsed <- exp_block %>%
      group_by(st, bigint, fuint, fuintname) %>%
      summarise(
        cenblks = n(),                    # count of Census blocks
        cenpop = sum(pop, na.rm = TRUE),  # Census population
        .groups = 'drop'
      ) %>%
      # Flag FUs that appear in more than one row within a state, i.e. that
      # are split across big units
      group_by(st, fuint) %>%
      mutate(split = n() > 1) %>%
      ungroup()

    # Property totals from the blocks that have property data.
    # Order matters inside summarise(): the block counts based on nprop_ are
    # taken before the nprop_ columns are overwritten by their sums.
    collapsed <- fullblk %>%
      group_by(st, bigint, fuint, fuintname, futype) %>%
      summarise(

        # Count of observed blocks
        clblks = n(),

        # Observed population
        clpop = sum(pop, na.rm = TRUE),

        # Counts of blocks with at least one property of each type
        across(starts_with('nprop_'), ~ sum(.x >= 1, na.rm = TRUE), .names = 'blks1{.col}'),

        # Counts of properties
        across(starts_with('nprop_'), ~ sum(.x, na.rm = TRUE)),    # count of properties
        across(starts_with('misval_'), ~ sum(.x, na.rm = TRUE)),   # count missing value
        across(starts_with('mistax_'), ~ sum(.x, na.rm = TRUE)),   # count missing tax
        across(starts_with('mistaxrt_'), ~ sum(.x, na.rm = TRUE)), # count missing tax rate
        across(starts_with('tax30_'), ~ sum(.x, na.rm = TRUE)),    # count of properties with tax > 30%

        # Total value and taxes
        across(
          c(starts_with('value'), starts_with('tax'), starts_with('ctax'), res_over1m),
          ~ sum(.x, na.rm = TRUE)
        ),

        # Count of blocks whose block-wide tax rate is > 10%
        across(starts_with('blktx_'), ~ sum(.x > 0.1, na.rm = TRUE), .names = 'flag10{.col}'),

        # Drop the grouping so the elementwise mutate() below is not run group by group
        .groups = 'drop'
      )

    # Calculate FU-wide rates
    collapsed <- mutate(
      collapsed,

      # Value and taxes per capita
      across(
        c(starts_with('value_'), starts_with('tax_'), starts_with('ctax_')),
        ~ .x / clpop,
        .names = 'pc_{.col}'
      ),

      # Overall tax rate by FU
      fu_taxrt_tot = tax_tot / value_tot,
      fu_taxrt_res = tax_res / value_res,
      fu_taxrt_ag = tax_ag / value_ag,
      fu_taxrt_oth = tax_oth / value_oth,

      # Capped tax rate by FU
      fu_ctaxrt_tot = ctax_tot / value_tot,
      fu_ctaxrt_res = ctax_res / value_res,
      fu_ctaxrt_ag = ctax_ag / value_ag,
      fu_ctaxrt_oth = ctax_oth / value_oth,

      # Percent of all properties in different data quality categories
      across(c(misval_tot, mistax_tot, mistaxrt_tot, tax30_tot), ~ .x / nprop_tot, .names = 'prpct_{.col}'),
      across(c(misval_res, mistax_res, mistaxrt_res, tax30_res), ~ .x / nprop_res, .names = 'prpct_{.col}'),
      across(c(misval_ag, mistax_ag, mistaxrt_ag, tax30_ag), ~ .x / nprop_ag, .names = 'prpct_{.col}'),
      across(c(misval_oth, mistax_oth, mistaxrt_oth, tax30_oth), ~ .x / nprop_oth, .names = 'prpct_{.col}')
    )

    # Add the property summaries onto the Census totals. FUs with no property
    # blocks keep their row and get NA in every property column.
    collapsed <- cen_collapsed %>%
      left_join(collapsed, by = c('st', 'bigint', 'fuint', 'fuintname')) %>%
      mutate(

        # Share of Census blocks / population not covered by property blocks
        cenmis_blk_pct = 1 - (clblks / cenblks),
        cenmis_pop_pct = 1 - (clpop / cenpop),

        # Flags for data quality issues. Apart from fl_pop40 (where a missing
        # share counts as a problem) every flag is FALSE when its input is NA,
        # so no flag is ever NA.
        fl_pop40 = cenmis_pop_pct >= .4 | is.na(cenmis_pop_pct),
        fl_mv40_tot = prpct_misval_tot >= .4 & nprop_tot > 0 & !is.na(prpct_misval_tot),
        fl_mv40_res = prpct_misval_res >= .4 & nprop_res > 0 & !is.na(prpct_misval_res),
        fl_mv40_ag = prpct_misval_ag >= .4 & nprop_ag > 0 & !is.na(prpct_misval_ag),
        # NOTE(review): strict `>` here while the tot/res/ag versions use `>=`;
        # left as in the original - confirm which boundary is intended
        fl_mv40_oth = prpct_misval_oth > .4 & nprop_oth > 0 & !is.na(prpct_misval_oth),
        fl_val2k_tot = pc_value_tot < 2000 & !is.na(pc_value_tot),
        fl_val2k_res = pc_value_res < 2000 & !is.na(pc_value_res),
        # NOTE(review): the file header describes this exemption as at least $10k of
        # property VALUE per person, but the flag tests TAXES per person - confirm
        fl_taxgt10k_tot = pc_tax_tot > 10000 & !is.na(pc_tax_tot),
        fl_taxgt10k_res = pc_tax_res > 10000 & !is.na(pc_tax_res),
        fl_mtr40_tot = prpct_mistaxrt_tot > .4 & nprop_tot > 0 & !is.na(prpct_mistaxrt_tot),
        fl_mtr40_res = prpct_mistaxrt_res > .4 & nprop_res > 0 & !is.na(prpct_mistaxrt_res),
        fl_tr15_tot = fu_taxrt_tot > .15 & !is.na(fu_taxrt_tot),
        fl_tr15_res = fu_taxrt_res > .15 & !is.na(fu_taxrt_res),
        fl_hitax10_tot = prpct_tax30_tot > .1 & !is.na(prpct_tax30_tot),
        fl_hitax10_res = prpct_tax30_res > .1 & !is.na(prpct_tax30_res),

        # Clean samples: value sample needs enough population coverage, enough
        # observed values and enough value per person, and either a FU-wide tax
        # rate that is not above 15% or the per-capita tax exemption; the tax
        # sample additionally needs enough observed tax rates and a FU-wide
        # tax rate that is not above 15%
        cleansample_val_tot = !fl_pop40 & !fl_mv40_tot & !fl_val2k_tot & (!fl_tr15_tot | fl_taxgt10k_tot),
        cleansample_val_res = !fl_pop40 & !fl_mv40_res & !fl_val2k_res & (!fl_tr15_res | fl_taxgt10k_res),
        cleansample_tax_tot = cleansample_val_tot & !fl_mtr40_tot & !fl_tr15_tot,
        cleansample_tax_res = cleansample_val_res & !fl_mtr40_res & !fl_tr15_res
      )

    # Add names: pad the big-unit id to the width used in its name table, join
    # the names and move the identifying columns to the front. Only the CBSA
    # name table carries `bigshortname`, hence any_of().
    big_info <- bigunit_lookup[[bigunit]]
    collapsed$bigint <- str_pad(collapsed$bigint, big_info$width, pad = '0')
    collapsed <- collapsed %>%
      left_join(big_info$names_tbl, by = 'bigint') %>%
      select(st, bigint, bigname, any_of('bigshortname'), fuint, fuintname, futype, everything())

    write_csv(collapsed, paste0('intermediate_data/fu_prop/fu_', bigunit, '_', futype, '.csv'))

  }

}
