# Do analysis of per capita personal income for Commuting Zones and MSAs
# To produce the graphs in supplement section 2

library(dplyr)
library(ggplot2)
library(Hmisc)
library(scales)
library(reshape2)

#NOTE(review): every path below is relative to this hard-coded project root, so the
#script only runs on a machine where the project lives at this location
setwd('~/projects/regionineq/')

#Make overall output directory (only when missing, so reruns do not emit an
#"already exists" warning; a genuine creation failure still warns)
if(!dir.exists('output/bea')) dir.create('output/bea', recursive = TRUE)

#Going to loop over MSAs vs CZs
#The four vectors are parallel: element i of each describes the same geography
varnames = c('msa','czone')               #ID column in the input csv
filenames = c('msa','cz')                 #tag used in input/output file and directory names
cleannames = c('msaname','czname')        #column holding the readable region name
citynames = c('CBSA','Commuting Zone')    #region type as written in plot labels

#The loop indexes all four vectors with the same i, so they must line up
stopifnot(length(filenames) == length(varnames),
          length(cleannames) == length(varnames),
          length(citynames) == length(varnames))

#Draw a ggplot object into a PDF file at `path` with the given page height.
#on.exit closes the device even when printing the plot fails, so an error
#cannot leave a graphics device open behind it.
write_plot_pdf = function(path, plt, height){
  pdf(path, height = height)
  on.exit(dev.off(), add = TRUE)
  print(plt)
  invisible(path)
}

#For each geography (MSA, then commuting zone): read the per capita income panel,
#export population-weighted dispersion statistics and income-category shares as csv,
#and draw four PDF figures into output/bea/<fname>/devstats
for(i in seq_along(varnames)){
  vname = varnames[i]
  fname = filenames[i]
  cname = cleannames[i]
  cityname = citynames[i]

  #Make output directories (only when missing, so reruns do not warn)
  outdir = file.path('output/bea', fname)
  statdir = file.path(outdir, 'devstats')
  for(d in c(outdir, statdir)){
    if(!dir.exists(d)) dir.create(d, recursive = TRUE)
  }

  #Bring in per capita income data from bea_make_cities.R
  cdta = read.csv(file.path('output/bea/dta', paste0(fname, '_gdppc.csv')))
  cdta$city = cdta[[vname]]
  cdta$cityname = cdta[[cname]]

  #If MSA, drop one MSA in New Mexico with no population in early years (because county was created in 1981)
  if(fname == 'msa') cdta = filter(cdta, city != '24380')

  #If CZ, drop two CZs in Alaska that don't have data in early years
  if(fname == 'cz') cdta = filter(cdta, city != '34101', city != '34107')

  #After the drops above no region-year may be missing per capita income
  stopifnot(!is.na(cdta$gdppc))

  #Turn inccat into properly ordered factor
  cdta$inccat = factor(cdta$inccat,
      levels = c('<80%','80-90%','90-110%','110-120%','>120%'))

  #Make log income
  cdta$loginc = log(cdta$gdppc)
  cdta$lognat = log(cdta$gdppcnat)

  #Statistics over time: population-weighted spread and quantiles of regional income
  #(summarise, not summarize: Hmisc is attached after dplyr and exports its own summarize)
  cstat = cdta %>% group_by(year) %>% summarise(
    lognat = mean(lognat), #Don't need to weight because it's the same across all regions for a given year
    wsdlog = sqrt(wtd.var(loginc, weights = pop)),
    wsd = sqrt(wtd.var(gdprel, weights = pop)),
    wp10 = wtd.quantile(gdprel, probs = .1, weights = pop),
    wp25 = wtd.quantile(gdprel, probs = .25, weights = pop),
    wp50 = wtd.quantile(gdprel, probs = .5, weights = pop),
    wp75 = wtd.quantile(gdprel, probs = .75, weights = pop),
    wp90 = wtd.quantile(gdprel, probs = .9, weights = pop),
    natmean = mean(gdppcnat)) %>% data.frame()

  #Make IQR, 10-90 range, 10-50-90 ratios
  cstat$wiqr = cstat$wp75 - cstat$wp25
  cstat$wd19 = cstat$wp90 - cstat$wp10
  cstat$wr95 = cstat$wp90 / cstat$wp50
  cstat$wr51 = cstat$wp50 / cstat$wp10
  cstat$wr15 = cstat$wp10 / cstat$wp50
  cstat$wr55 = cstat$wp50 / cstat$wp50 #always 1: the median as a share of itself, plotted as the reference line
  cstat$wlogrel = cstat$wsdlog / cstat$lognat

  #Export
  write.csv(cstat, file.path(statdir, paste0('bea_', fname, '_devstats.csv')), row.names = FALSE)

  #Do 10-50 and 50-90 ratios over time
  ratvars = paste0('wr', c('55','15','95'))
  cmelt = melt(cstat[,c('year',ratvars)], id.vars = 'year')

  gplt = ggplot(cmelt, aes(x = year, y = value, group = variable, lty = variable)) +
    geom_line() +
    theme_bw() +
    xlab('Year') +
    #breaks reorders ratvars to 10th / median / 90th so they line up with the labels
    scale_linetype_discrete(breaks = ratvars[c(2,1,3)], labels = c('10th percentile','Median','90th percentile'),name = '') +
    scale_y_continuous(limits = c(.5,1.5), name = paste("Percent of median",cityname,"\nper capita personal income"), labels = percent) +
    theme(legend.position = 'bottom')
  write_plot_pdf(file.path(statdir, 'bea_p159.pdf'), gplt, height = 6)

  #Calculate percent of country in each income category over time
  ncities = length(unique(cdta$city)) #same for every group, so computed once
  inccats = cdta %>% group_by(year, inccat) %>% summarise(
    pop = sum(pop)/mean(popnat),
    gdp = sum(gdp)/mean(gdptotnat),
    cnt = length(inccat)/ncities ) %>% data.frame()

  #Export
  write.csv(inccats, file.path(statdir, paste0('bea_', fname, '_inccats.csv')), row.names = FALSE)

  #Make trajectories of individual cities; only regions above 3 million people in 2015 get a text label
  citylabs = filter(cdta, year == 2015, pop > 3000000)

  #NOTE(review): alpha is mapped to each row's own-year pop, while the legend title
  #says 'Population in 2015' - confirm which is intended
  gplt = ggplot(cdta %>% arrange(-gdprel), aes(x = year, y = gdprel, group = city)) +
    geom_line(aes( alpha = pop)) +
    theme_bw() +
    #x axis runs to 2020 to leave room for the labels placed at 2015.5
    scale_x_continuous(name = "Year", limits = c(min(cdta$year),2020)) +
    scale_alpha_continuous(name = 'Population in 2015', labels = comma, range = c(0,1)) +
    geom_text(data = citylabs, aes(x = 2015.5, y = gdprel, label = cityname),hjust = 0, size = 1.75)+
    theme(legend.position = 'bottom')+
    scale_y_continuous(labels = percent, limits=c(.5,1.75), name = 'Per capita personal income relative to nation')
  write_plot_pdf(file.path(statdir, paste0('bea_traj_', fname, '_gdprel.pdf')), gplt, height = 6)

  #Do GDP relative to country in 1980 vs 2013 - figure S3
  c80 = filter(cdta, year == 1980)
  c13 = filter(cdta, year == 2013)
  cd = merge(c80, c13, by = 'city', suffixes = c('_8',"_1")) #_8 = 1980 columns, _1 = 2013 columns
  clabs = filter(cd, pop_1 >3000000)

  gplt = ggplot() +
    geom_point(data = cd, aes(x = gdprel_8, y = gdprel_1, size = pop_1),alpha = .3) +
    geom_text(data = clabs, aes(x = gdprel_8+.013, y = gdprel_1, label = cityname_1),vjust = "middle", hjust = "left", size=  1.7)+
    scale_size_continuous(range = c(0,6), name = '2013 Population:', labels = comma) +
    #45-degree line: points above it gained relative to the nation
    geom_abline(slope = 1, intercept = 0, lty = 3) +
    scale_x_continuous(limits = c(0.45,1.65), labels = percent, name = 'Per capita income relative to nation, 1980') +
    scale_y_continuous(limits = c(0.45,1.65), labels = percent, name = 'Per capita income relative to nation, 2013') +
    theme_bw()+
    theme(legend.position = 'bottom')
  write_plot_pdf(file.path(statdir, paste0('bea_changerel_', fname, '5in.pdf')), gplt, height = 5)

  #Do beta divergence - Figure S4
  #Attach each region's 1980 population and income to its 2013 row
  nyears = 2013 - 1980 #length of the growth window, used to annualise
  dta80 = c80[,c('city','pop','gdppc')]
  dta13 = merge(c13, dta80, by = 'city', suffixes = c('','_80'))
  dta13$anngr = (dta13$gdppc / dta13$gdppc_80) ^(1/nyears) - 1

  #Fit linear model of annualised growth on initial level, weighted by 1980 population
  #NOTE(review): an earlier comment here said regions with zero per capita income in
  #1980 are dropped; no such filter is visible in this loop - confirm it happens upstream
  mod1 = lm(formula = anngr ~ gdppc_80,weights = pop_80,data = dta13)

  #Plot annual growth rate vs initial level, with the fitted line overlaid
  gplt = ggplot(dta13, aes(x = gdppc_80, y = anngr,size = pop_80)) +
    geom_point(alpha = .3) +
    theme_bw() +
    scale_x_continuous(labels = dollar, name = paste('1980',cityname,'per capita personal income'))+
    scale_y_continuous(labels = percent, name = paste('Annualized growth rate of per capita personal\nincome, 1980-2013')) +
    geom_abline(intercept = mod1$coefficients[1], slope = mod1$coefficients[2],lty = 2) +
    scale_size_continuous(range = c(0,5), labels = comma, name = paste('1980',cityname, '\npopulation')) +
    theme(legend.position = 'bottom')
  write_plot_pdf(file.path(statdir, paste0('bea_beta_', fname, '.pdf')), gplt, height = 6)
}
