# Make scatter plots of TFQ compared to economic segregation and jurisdictional fragmentation
# Also exports Table 1 in the main text giving CBSA-level summary statistics

library(tidyverse); theme_set(theme_bw())
library(ggrepel)
library(scales)

# Make sure the output folder exists. `recursive = TRUE` also creates a missing
# parent 'output/' folder, and `showWarnings = FALSE` keeps re-runs quiet when
# the folder is already there.
dir.create('output/prev_metrics', recursive = TRUE, showWarnings = FALSE)

# Unit codes that are pasted into every input and output file name below
bigunit <- 'cbsa'
smallunit <- 'loc2'

# Load the two metro-level inputs: economic segregation and TFQ
seg_path <- 'output/econseg/econseg_cbsas.csv'
tfq_path <- paste0('output/tfq/tfq_', bigunit, '_', smallunit, '.csv')

seg <- read_csv(seg_path)
tfq <- read_csv(tfq_path)

# Load the per-unit file and flag units whose per-capita value is extreme
# relative to `bigpc` (the counts per `bigint` become the tax haven / desert
# columns of the output tables)
fu_path <- paste0('output/tfq/tfq_fu/fu_big_', bigunit, '_', smallunit, '_tot.csv')

fudta <- read_csv(fu_path) %>%
  mutate(
    valpc_pct = valpc / bigpc,
    # NOTE(review): the name suggests 1k but the cutoff is 50,000; this flag is
    # not used anywhere else in this script
    over1k = clpop >= 50000,
    low_rel = valpc_pct <= 1/3,  # at most one third of bigpc
    hi_rel = valpc_pct >= 3      # at least three times bigpc
  )

# Number of flagged units in each bigint
fucbsa <- fudta %>%
  group_by(bigint) %>%
  summarise(
    munihavens = sum(hi_rel),
    fiscimpov = sum(low_rel)
  )


# Attach both segregation measures and the haven/desert counts to the TFQ table
seg_all <- seg %>% filter(valtype == 'all') %>% select(-valtype)
seg_res <- seg %>% filter(valtype == 'res') %>% select(-valtype)

dta <- tfq %>%
  left_join(seg_all, by = c('bigint' = 'cbsa')) %>%
  # Columns present on both sides of this join get the _tot / _res suffixes
  left_join(seg_res, by = c('bigint' = 'cbsa'), suffix = c('_tot', '_res')) %>%
  left_join(fucbsa, by = 'bigint') %>%
  # Drop rows with no metro identifier
  filter(!is.na(bigint))

# Create table 1: top and bottom 10 CBSAs by TFQ, among those with a
# population of at least 500,000
table1_cols <- c('bigint', 'bigname', 'pop_tot', 'nprop_tot', 'val_pc_tot',
                 'tfq_val_tot', 'ngovts_tot', 'govts100k_tot', 'econseg_tot',
                 'econseg_res', 'munihavens', 'fiscimpov')

table1_pool <- dta %>%
  filter(pop_tot >= 500000) %>%
  select(all_of(table1_cols))

# head() returns at most the rows that exist; indexing with [1:10, ] would pad
# the table with all-NA rows whenever fewer than 10 metros pass the filter
top10 <- table1_pool %>% arrange(-tfq_val_tot) %>% head(10)
bottom10 <- table1_pool %>% arrange(tfq_val_tot) %>% head(10)

# Highest-TFQ group first, then the lowest-TFQ group, each in descending order
table1 <- bind_rows(top10, bottom10 %>% arrange(-tfq_val_tot))

write_csv(table1, paste0('output/prev_metrics/table_1_tfq_',bigunit,'_',smallunit,'.csv'))


# Summary file restricted to large metros (population of at least 500,000),
# sorted from highest to lowest TFQ.
# NOTE(review): unlike the table 1 file name, the two file names below have no
# underscore between 'tfq' and the unit code; left as is because other code may
# read these files.
largemetros <- dta %>%
  filter(pop_tot >= 500000) %>%
  arrange(-tfq_val_tot) %>%
  select(
    bigint, bigname, pop_tot, val_pc_tot, val_pc_res, nprop_tot,
    tfq_val_tot, tfq_val_res, ngovts_tot, govts100k_tot,
    econseg_tot, econseg_res, munihavens, fiscimpov
  )
large_path <- paste0('output/prev_metrics/large_metros_tfq',bigunit,'_',smallunit,'.csv')
write_csv(largemetros, large_path)

# Summary file covering every metro, same sort order
all_path <- paste0('output/prev_metrics/all_metros_tfq',bigunit,'_',smallunit,'.csv')
dta %>%
  arrange(-tfq_val_tot) %>%
  select(
    bigint, bigname, pop_tot, val_pc_tot, tfq_val_tot, ngovts_tot,
    govts100k_tot, econseg_tot, econseg_res, munihavens, fiscimpov
  ) %>%
  write_csv(all_path)


# Output clean metro level data for circulating: same rows as `dta`, sorted by
# descending metro code, with self-explanatory column names
cleandta <- dta %>%
  arrange(-bigint) %>%
  select(
    bigint,
    bigname,
    pop = pop_tot,
    nprop = nprop_tot,
    value_pc_all_properties = val_pc_tot,
    value_pc_residential = val_pc_res,
    tfq_all_properties = tfq_val_tot,
    tfq_residential = tfq_val_res,
    num_govts = ngovts_tot,
    govts_per_100k = govts100k_tot,
    econseg_all_properties = econseg_tot,
    econseg_residential = econseg_res,
    num_tax_havens = munihavens,
    num_fisc_impov = fiscimpov
  )

# The first two columns are named after the big unit, e.g. cbsa_code / cbsa_name
names(cleandta)[1:2] <- paste0(bigunit, c('_code', '_name'))

# File-name label for the small unit. An unrecognised code stops here with a
# clear message instead of leaving `smallname` undefined (or stale from an
# earlier run) when the path is built.
smallname <- switch(smallunit,
  loc2 = 'gen_purpose_govts',
  loc = 'gen_purpose_govts_incl_townships',
  stcnty = 'counties',
  stop("No clean-data file label defined for smallunit = '", smallunit, "'", call. = FALSE)
)

# write_csv() does not create folders, so make sure the target exists
dir.create('clean_data', recursive = TRUE, showWarnings = FALSE)
cleandta %>% write_csv(paste0('clean_data/',bigunit,'_summary_',smallname,'.csv'))




## Graph of econ seg and jurisdictions versus TFQ for appendix
# Long format: one row per metro and x-axis measure (segregation index or log
# governments per 100k), keeping only metros where both measures are present
pdta <- dta %>%
  filter(!is.na(econseg_tot), !is.na(govts100k_tot)) %>%
  mutate(lgovts100k = log(govts100k_tot)) %>%
  pivot_longer(cols = c(econseg_tot, lgovts100k)) %>%
  # Facet titles, in panel order
  mutate(
    cleanname = factor(
      name,
      levels = c('econseg_tot', 'lgovts100k'),
      labels = c('A. Economic segregation', 'B. Jurisdictional fragmentation')
    )
  )

# Axis helpers shared by both facets of the scatter plot. The panel is guessed
# from the values handed over by the scale: a maximum below 1 is treated as the
# segregation index, anything else as the logged governments-per-100k measure.
# NOTE(review): this heuristic assumes the logged measure always reaches 1 or
# more somewhere in its panel -- confirm against the data.

# Tick positions: fixed index values, or the logs of 0.1 / 1 / 10 / 100.
# x: numeric vector supplied by the scale; NAs are ignored.
my_breaks <- function(x) {
  if (max(x, na.rm = TRUE) < 1) {
    c(0, .2, .4, .6)
  } else {
    log(c(.1, 1, 10, 100))
  }
}

# Tick labels matching my_breaks(): the index values themselves, or the
# un-logged values 0.1 / 1 / 10 / 100.
# x: numeric vector supplied by the scale; NAs are ignored.
my_labels <- function(x) {
  if (max(x, na.rm = TRUE) < 1) {
    c(0, .2, .4, .6)
  } else {
    c(0.1, 1, 10, 100)
  }
}

# Metros that get a text label: the very largest, plus progressively smaller
# metros the higher their TFQ is
label_dta <- filter(
  pdta,
  pop_tot > 6000000 |
    (tfq_val_tot > .2 & pop_tot > 1000000) |
    (tfq_val_tot > .3 & pop_tot > 100000) |
    (tfq_val_tot > .41 & pop_tot > 40000)
)

# Build the figure before opening the PDF device so that a failure while
# assembling it cannot leave a device open
gplt <- ggplot(pdta, aes(x = value, y = tfq_val_tot, size = pop_tot)) +
  geom_point(alpha = .12) +
  geom_text_repel(data = label_dta,
                  aes(label = bigname), size = 1.45, segment.size = .15, seed = 1234, min.segment.length = .1) +
  # One panel per x-axis measure, each with its own x range
  facet_wrap(. ~ cleanname, scales = 'free_x') +
  geom_hline(yintercept = 0, alpha = .4) +
  scale_y_continuous(name = 'Tax base fragmentation quotient') +
  # A single axis title spans both panels, hence the run of spaces in the middle
  scale_x_continuous(name = "Rank-order info. theory index               Gov'ts per 100k pop. (log scale)", breaks = my_breaks, labels = my_labels) +
  scale_size_continuous(name = 'Population', labels = comma, range = c(0, 6))

pdf(paste0('output/prev_metrics/sc_fa5.1_',bigunit,'_',smallunit,'_combined.pdf'),height = 5)
# `finally` closes the device even when rendering fails, so the file is
# released and later plots do not end up in the wrong device
tryCatch(print(gplt), finally = dev.off())

# Same graph but residential property only
# Long format: one row per metro and x-axis measure, keeping only metros where
# both residential measures are present
pdta <- dta %>%
  filter(!is.na(econseg_res), !is.na(govts100k_res)) %>%
  mutate(lgovts100k = log(govts100k_res)) %>%
  pivot_longer(cols = c(econseg_res, lgovts100k)) %>%
  # Facet titles, in panel order
  mutate(
    cleanname = factor(
      name,
      levels = c('econseg_res', 'lgovts100k'),
      labels = c('A. Economic segregation', 'B. Jurisdictional fragmentation')
    )
  )

# Axis helpers for the residential plot. The panel is guessed from the values
# handed over by the scale: a maximum below 1 is treated as the segregation
# index, anything else as the logged governments-per-100k measure.
# NOTE(review): this heuristic assumes the logged measure always reaches 1 or
# more somewhere in its panel -- confirm against the data.

# Tick positions: fixed index values, or the logs of 0.1 / 1 / 10 / 100.
# x: numeric vector supplied by the scale; NAs are ignored.
my_breaks <- function(x) {
  if (max(x, na.rm = TRUE) < 1) {
    c(0, .2, .4, .6)
  } else {
    log(c(.1, 1, 10, 100))
  }
}

# Tick labels matching my_breaks(): the index values themselves, or the
# un-logged values 0.1 / 1 / 10 / 100.
# x: numeric vector supplied by the scale; NAs are ignored.
my_labels <- function(x) {
  if (max(x, na.rm = TRUE) < 1) {
    c(0, .2, .4, .6)
  } else {
    c(0.1, 1, 10, 100)
  }
}

# Metros that get a text label: the very largest, plus progressively smaller
# metros the higher their residential TFQ is
label_dta <- filter(
  pdta,
  pop_res > 6000000 |
    (tfq_val_res > .2 & pop_res > 1000000) |
    (tfq_val_res > .3 & pop_res > 100000) |
    (tfq_val_res > .41 & pop_res > 40000)
)

# Build the figure before opening the PDF device so that a failure while
# assembling it cannot leave a device open
gplt <- ggplot(pdta, aes(x = value, y = tfq_val_res, size = pop_res)) +
  geom_point(alpha = .12) +
  geom_text_repel(data = label_dta,
                  aes(label = bigname), size = 1.45, segment.size = .15, seed = 1234, min.segment.length = .1) +
  # One panel per x-axis measure, each with its own x range
  facet_wrap(. ~ cleanname, scales = 'free_x') +
  geom_hline(yintercept = 0, alpha = .4) +
  scale_y_continuous(name = 'TFQ - residential property only') +
  # A single axis title spans both panels, hence the run of spaces in the middle
  scale_x_continuous(name = "Rank-order info. theory index               Gov'ts per 100k pop. (log scale)", breaks = my_breaks, labels = my_labels) +
  scale_size_continuous(name = 'Population', labels = comma, range = c(0, 6))

pdf(paste0('output/prev_metrics/sc_fa5.2_',bigunit,'_',smallunit,'_combined_res.pdf'),height = 5)
# `finally` closes the device even when rendering fails, so the file is
# released and later plots do not end up in the wrong device
tryCatch(print(gplt), finally = dev.off())
