# Calculate statistics of average trends in divergence using IPUMS mean and median income data
# For various samples

library(dplyr)
library(ggplot2)
library(Hmisc)
library(scales)
library(reshape2)
library(ggthemes)
library(maps)
library(stringr)

# All data/ and output/ paths below are relative to this project directory
setwd('~/projects/regionineq/')

# County polygons from the maps package, keyed by 'state,county' (reg) and
# joined to the MSA crosswalk, whose polyname column is copied to the same key
msas = read.csv('data/mable/cntymsakey.csv') %>%
  mutate(reg = polyname)

cntys = map_data('county') %>%
  mutate(reg = paste(region, subregion, sep = ',')) %>%
  merge(msas, by = 'reg')

# CZ names from the Chetty et al 2014 online data tables:
# zero-pad the CZ code to 5 characters and build a 'Name, State' label
cznames = read.csv('data/cz/cznames_chetty2014.csv') %>%
  mutate(
    czone = str_pad(CZ, 5, pad = '0'),
    czname = paste(CZ.Name, State, sep = ', ')
  ) %>%
  select(czone, czname)

# Parallel vectors driving the loop over geographies (MSAs vs CZs):
# element i of each vector describes the same geography
varnames   = c('msa', 'czone')              # column in cntys / id variable
filenames  = c('msa', 'cz')                 # token used in file and folder names
cleannames = c('msaname', 'czname')         # name column
citynames  = c('CBSA', 'Commuting Zone')    # label used in plot text

# Main driver. For each geography in varnames (MSA, then commuting zone) and
# each income sample in snames, this loop:
#   1. stacks the per-year city files and attaches the national mean/median,
#   2. computes cnt-weighted dispersion statistics of city income relative to
#      the nation for each year (cnt is labelled as population in the plots),
#   3. fits a cnt-weighted regression of annualized 1980-2013 growth on the
#      1980 level (beta convergence),
#   4. writes percentile-ratio, beta, relative-change and map PDFs, and
#   5. writes one CSV of dispersion statistics and one CSV of beta
#      coefficients per geography.
for(i in seq_along(varnames)){
  vname = varnames[i]
  fname = filenames[i]
  cname = cleannames[i]
  cityname = citynames[i]

  # Input/output roots for this geography
  basedir = paste0('output/ipums/',fname,'/')
  devdir = paste0(basedir,'devstats/')

  # Key the county polygons by the current geography's id column
  cntys$city = cntys[[vname]]

  natldta = read.csv(paste0(basedir,'dta/',fname,'_natinfo.csv'))
  natldta$year = natldta$fullyr

  # Sample codes (used in file names) and matching labels (used in plot text)
  snames = c('fam','famsqrt','famnorm','hh','hhsqrt','hhnorm','adult','adultmale','adultfemale')
  tnames = c('family','sqrt-normalized family','normalized family','household','sqrt-normalized household','normalized household','adult individual','adult male','adult female')

  # Do not warn when the directory already exists (reruns); create parents if needed
  dir.create(devdir, showWarnings = FALSE, recursive = TRUE)

  # Accumulators across samples and statistics, written out once per geography
  cstats = NULL
  betaconv = NULL

  for(s in seq_along(snames)) {
    sname = snames[s]
    tname = tnames[s]
    sampdir = paste0(devdir,sname)
    dir.create(sampdir, showWarnings = FALSE, recursive = TRUE)
    print(paste(vname,sname))

    #Get all years in one frame
    allyrs = NULL
    for(yr in c('80','90','00','08','13')) {
      dta = read.csv(paste0(basedir,'dta/',sname,'/',fname,'_citymeans_',sname,'_',yr,'.csv'))
      # Two-digit year to four-digit: 50-99 -> 19xx, 00-49 -> 20xx
      # (any 'a'/'b' suffix on the year token is stripped first)
      yrn = as.numeric(gsub('a|b','',yr))
      yrn = if(yrn >= 50) 1900 + yrn else 2000 + yrn

      dta$year = yrn
      allyrs = rbind(allyrs, dta)
    }

    #Get national level data; national columns get the suffix '_nat'
    ndta = filter(natldta, samp == sname)
    dta = merge(allyrs, ndta[,c('fullyr','mean','median')], by.x = 'year', by.y = 'fullyr', all.x = TRUE, suffixes = c('','_nat'))

    # Drop cities missing data in 1980 to conform to counterfactuals
    if(vname == 'czone'){
      dta = filter(dta, city != '34101')
    }
    if(vname == 'msa'){
      dta = filter(dta, city != '24380')
    }

    stattypes = c('median','mean')
    for(st in stattypes){

      # Working copies of the city-level and national statistic being analysed
      dta$stat = dta[[st]]
      dta$stat_nat = dta[[paste0(st,'_nat')]]

      #Compute statistics
      dta$rel = dta$stat / dta$stat_nat   # city income relative to the nation
      dta$log = log(dta$stat)
      # Bins of relative income used to shade the maps
      dta$cat = cut(dta$rel,breaks = c(0,.8,.9,1.1,1.2,10000), labels = c('<80%','80-90%','90-110%','110-120%','>120%'))

      #Statistics over time, weighted by cnt
      cstat = dta %>% group_by(year) %>% summarise(
        wsd = sqrt(wtd.var(rel, weights = cnt)),
        wsdlog = sqrt(wtd.var(log, weights = cnt)),
        wp10 = wtd.quantile(rel,probs = .1,weights = cnt),
        wp25 = wtd.quantile(rel,probs = .25,weights = cnt),
        wp50 = wtd.quantile(rel,probs = .5,weights = cnt),
        wp75 = wtd.quantile(rel,probs = .75,weights = cnt),
        wp90 = wtd.quantile(rel,probs = .9,weights = cnt),
        natstat = mean(stat_nat)) %>% data.frame()

      #Make IQR, 10-50-90 ratios, etc
      cstat$wiqr = cstat$wp75 - cstat$wp25
      cstat$wd19 = cstat$wp90 - cstat$wp10
      cstat$wr95 = cstat$wp90 / cstat$wp50
      cstat$wr51 = cstat$wp50 / cstat$wp10
      cstat$wr15 = cstat$wp10 / cstat$wp50
      cstat$wr55 = cstat$wp50 / cstat$wp50   # median over itself: plotted as the 'Median' series
      cstat$wr91 = cstat$wp90 / cstat$wp10

      #Labels for output
      cstat$samp = sname
      cstat$stat = st

      cstats = rbind(cstats, cstat)

      #Do 10-50 and 50-90 ratios over time
      ratvars = paste0('wr',c('55','15','95'))
      cmelt = melt(cstat[,c('year',ratvars)], id.vars = 'year')

      pdf(paste0(sampdir,'/ipums_p159_',sname,'_',st,'.pdf'),height = 5)
      gplt = ggplot(cmelt, aes(x = year, y = value, group = variable, lty = variable)) +
        geom_line() +
        geom_point() +
        theme_bw() +
        xlab('Year') +
        theme(legend.position = 'bottom') +
        scale_linetype_discrete(breaks = ratvars[c(2,1,3)], labels = c('10th percentile','Median','90th percentile'),name = '') +
        scale_y_continuous(limits = c(.5,1.5), name = paste("Percent of median",cityname), labels = percent)
      print(gplt)
      dev.off()

      #Beta convergence
      dta80 = filter(dta, year == 1980)[,c('city','cnt','stat')]
      dtabeta = merge(dta,dta80,by = 'city',suffixes = c('','_80'))

      # Annualized growth since 1980; the exponent assumes the 33 years from
      # 1980 to 2013, so only the 2013 rows are kept below
      dtabeta$anngr = (dtabeta$stat / dtabeta$stat_80) ^(1/33) - 1
      dta13 = filter(dtabeta, year == 2013)

      #Fit linear model of growth on initial level, weighted by 1980 cnt
      # NOTE(review): the original comment says cities with zero 1980 median
      # income are dropped, but no explicit drop is visible here - confirm
      mod1 = lm(formula = anngr ~ stat_80,weights = cnt_80,data = dta13)
      betas = c(sname,st,mod1$coefficients[2])
      betaconv = rbind(betaconv, betas)

      #Plot annual growth rate vs initial level
      pdf(paste0(sampdir,'/ipums_beta_',sname,'_',st,'.pdf'),height = 6)
      gplt = ggplot(dta13, aes(x = stat_80, y = anngr,size = cnt_80)) +
        geom_point(alpha = .3) +
        theme_bw() +
        scale_x_continuous(labels = dollar, name = paste('1980',cityname,st,tname,'income'))+
        scale_y_continuous(labels = percent, name = paste('Annualized growth rate in',st,'income, 1980-2013')) +
        geom_abline(intercept = mod1$coefficients[1], slope = mod1$coefficients[2],lty = 2) +
        scale_size_continuous(range = c(0,5), labels = comma, name = paste('1980',cityname, '\npopulation')) +
        theme(legend.position = 'bottom')
      print(gplt)
      dev.off()

      #Make graph of relative income in 1980 and 2013
      d80 = dta %>% filter(year == 1980)
      d13 = dta %>% filter(year == 2013)
      # Full outer join, so a city present in only one of the two years is kept with NAs
      dchange = merge(d80, d13, by = 'city', all = TRUE, suffixes = c('_80','_13'))

      #Only label places with 2013 cnt above 3.5 million
      dlabs = filter(dchange, cnt_13 >3500000)

      pdf(paste0(sampdir,'/ipums_changerel_',sname,'_',st,'.pdf'),height = 6)
      gplt = ggplot() +
        geom_point(data = dchange, aes(x = rel_80, y = rel_13, size = cnt_13),alpha = .3) +
        geom_text(data = dlabs, aes(x = rel_80+.013, y = rel_13, label = cityname_13),vjust = "middle", hjust = "left", size=  1.7)+
        scale_size_continuous(range = c(0,6), name = '2013 Population:', labels = comma) +
        geom_abline(slope = 1, intercept = 0, lty = 3) +
        scale_x_continuous(limits = c(0.45,1.65), labels = percent, name = paste(str_to_title(st),tname,'income relative to nation, 1980')) +
        scale_y_continuous(limits = c(0.45,1.65), labels = percent, name = paste(str_to_title(st),tname,'income relative to nation, 2013')) +
        theme_bw()+
        theme(legend.position = 'bottom')
      print(gplt)
      dev.off()

      #Map data
      for(yr in c(1980,1990,2000,2008,2013)){
        mdtayr = filter(dta, year == yr)
        # Keep every county polygon; counties without data get an NA category
        mapdat = merge(cntys,mdtayr,by = 'city', all.x = TRUE)
        # Restore polygon vertex order, which merge() does not preserve
        mapdat = mapdat %>% arrange(order)
        # Only the first and last years get a panel title
        maptit = ''
        if(yr == 1980) maptit = "A: 1980"
        if(yr == 2013) maptit = "B: 2013"

        #Categorical map
        pdf(paste0(sampdir,'/map_',sname,'_',st,'_',yr,'.pdf'),height = 4.5)
        gplot = ggplot(data = mapdat, aes(long, lat, group = group)) +
          geom_polygon( aes(fill = cat)) +
          coord_map("polyconic") +
          # Legend title follows the statistic and sample actually mapped
          # (it was previously fixed to 'Mean family income' for every map)
          scale_fill_grey(start = .9,end = .1,name = paste(str_to_title(st),tname,'income\nrelative to nation')) +
          theme_map() +
          theme(legend.position = 'bottom',legend.justification = 'center') +
          ggtitle(maptit)
        print(gplot)
        dev.off()
      }
    } #stat

  } #sample

  # One row per sample x statistic: slope of the growth-on-initial-level regression
  betaconv = data.frame(betaconv)
  names(betaconv) = c('samp','stat','beta')
  write.csv(betaconv,paste0(devdir,'beta_',fname,'.csv'),row.names = FALSE)
  write.csv(cstats,paste0(devdir,'dev_',fname,'.csv'),row.names = FALSE)
} #city
