# Try dropping top 1, 5, and 10 percent and recomputing divergence statistics
# Creates Figure 5

library(dplyr)
library(Hmisc)
library(stringr)
library(RColorBrewer)
library(scales)
library(ggplot2)
library(reshape2)
library(ggthemes)

# Project root. Every input/output path used below ('output/ipums/...') is
# relative, so it is resolved against this directory.
# NOTE(review): hard-coded, user-specific location -- adjust before running elsewhere.
setwd('~/projects/regionineq/')

# Geography definitions, stored as parallel vectors: element 1 describes
# MSAs/CBSAs, element 2 describes commuting zones (CZs). The main loop walks
# over these positions.
# Label printed in the progress message
varnames = c("msa", "czone")
# Stem used in directory/file names; also the geography id column of the
# city percentile files
filenames = c("msa", "cz")
# Copied into `cname` by the loop; not otherwise referenced in the visible code
cleannames = c("msaname", "czname")
# Human-readable label used in plot axis and legend titles
citynames = c("CBSA", "Commuting Zone")

#National means are read inside the loop below, once per geography

# Main driver. For each geography (MSA, then commuting zone), each income
# sample and each statistic (median, mean):
#   1. recompute national and city statistics after dropping the top 1, 5 and
#      10 percentiles;
#   2. express each city statistic relative to the matching national one and
#      compute cross-city dispersion (SD, inter-quartile range, 90-10 range)
#      by year; plot it in levels and relative to its 1980 value;
#   3. regress annualized 1980-2013 growth on the 1980 level (beta
#      convergence) and plot the fit.
# Dispersion series and regression slopes are written to one CSV each per
# geography under output/ipums/<geography>/droptop/.
for(i in seq_along(varnames)){
  vname = varnames[i]
  fname = filenames[i]
  cname = cleannames[i]
  cityname = citynames[i]

  #National summary statistics for all samples; copy columns to the short
  #names ('med', 'year') used in the rest of the loop
  natmeans = read.csv(paste0('output/ipums/',fname,'/dta/',fname,'_natinfo.csv'))
  natmeans$med = natmeans$median
  natmeans$year = natmeans$fullyr

  #Sample codes (used in file names) and matching labels (used in plot titles)
  snames = c('fam','famsqrt','famnorm','hh','hhsqrt','hhnorm','adult','adultmale','adultfemale')
  tnames = c('family','sqrt-normalized family','normalized family','household','sqrt-normalized household','normalized household','adult individual','adult male','adult female')

  #showWarnings = FALSE so reruns do not warn about existing directories;
  #recursive = TRUE so missing parent directories are created too
  dir.create(paste0('output/ipums/',fname,'/droptop'), showWarnings = FALSE, recursive = TRUE)

  #Accumulators: one block of rows per (sample, statistic)
  dtsig = NULL
  dtbeta = NULL

  for(s in seq_along(snames)) {
    sname = snames[s]
    tname = tnames[s]

    dir.create(paste0('output/ipums/',fname,'/droptop/',sname), showWarnings = FALSE, recursive = TRUE)

    #Get overall national mean and median
    natstats = filter(natmeans,samp == sname)

    #Get city mean and median (one file per year; note the objects named
    #citym10/citym15 hold the 2008 and 2013 files)
    citym80 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_citymeans_',sname,'_80.csv'))
    citym80$year = 1980
    citym90 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_citymeans_',sname,'_90.csv'))
    citym90$year = 1990
    citym00 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_citymeans_',sname,'_00.csv'))
    citym00$year = 2000
    citym10 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_citymeans_',sname,'_08.csv'))
    citym10$year = 2008
    citym15 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_citymeans_',sname,'_13.csv'))
    citym15$year = 2013

    citymeans = rbind(citym80, citym90, citym00, citym10, citym15)
    citymeans$med = citymeans$median
    citymeans$pop = citymeans$cnt

    #Make dataframe to store national information from each sample
    #(percentile-level rows; the code below uses columns p, cnt, year, med, mean)
    natinfo80 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_1980.csv'))
    natinfo90 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_1990.csv'))
    natinfo00 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_2000.csv'))
    natinfo10 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_2008.csv'))
    natinfo15 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_2013.csv'))

    natinfo = rbind(natinfo80,natinfo90,natinfo00, natinfo10, natinfo15)

    #Make city year by year data frame
    city80 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_1980.csv'))
    city90 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_1990.csv'))
    city00 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_2000.csv'))
    city10 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_2008.csv'))
    city15 = read.csv(paste0('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_2013.csv'))

    city = rbind(city80, city90, city00, city10, city15)

    #Add city variable (geography id column is named after the file stem)
    city$city = as.character(city[,fname])

    #Do mean and median
    stattypes = c('med','mean')
    statnames = c('median','mean')
    for(statnum in seq_along(stattypes)){
      st = stattypes[statnum]
      stname = statnames[statnum]

      print(paste(vname,sname,stname))

      #National percentiles
      natinfo$stat = natinfo[,st]

      #City percentiles
      city$stat = city[,st]

      #Overall national statistic
      natcalc = natstats
      natcalc$natstat = natcalc[,st]

      #Overall city statistic
      citycalc = citymeans
      citycalc$stat = citymeans[,st]

      for(droptop in c(1,5,10)){

        #Make grouped percentiles
        natdropped = filter(natinfo, p <= 100-droptop)
        #Make sure dropping right number of observations: `droptop` rows for
        #each of the 5 years stacked in natinfo
        stopifnot(nrow(natdropped) == nrow(natinfo) - 5 * droptop)

        #Make national means for each year after dropping
        if(st == 'mean'){
          natmeansdr = natdropped %>% group_by(year) %>% summarise(stat = wtd.mean(stat, weights = cnt)) %>% data.frame()
        }
        if(st == 'med'){
          natmeansdr = natdropped %>% group_by(year) %>% summarise(natstat = wtd.quantile(stat, weights = cnt,probs = .5)) %>% data.frame()
        }
        #Both branches end up with the same column names here
        names(natmeansdr) = c('year',paste0('natstat_dr',droptop))

        #Combine with overall means
        natcalc = merge(natcalc,natmeansdr,by = 'year')

        #Make city means for each year after dropping
        citydropped = filter(city, p <= 100-droptop)
        if(st == 'mean'){
          citymeansdr = citydropped %>% group_by(city, year) %>% summarise(pop = sum(cnt), stat = wtd.mean(stat, weights = cnt)) %>% data.frame()
        }
        if(st == 'med'){
          citymeansdr = citydropped %>% group_by(city, year) %>% summarise(pop = sum(cnt), stat = wtd.quantile(stat, weights = cnt,probs = .5)) %>% data.frame()
        }
        names(citymeansdr) = c('city','year',paste0('pop_dr',droptop),paste0('stat_dr',droptop))

        #Combine with overall means
        citycalc = merge(citycalc, citymeansdr, by = c('city','year'))
      }

      #Compute city means relative to national means
      citycalc = merge(citycalc, natcalc[,c('year',grep('natstat',names(natcalc),value = TRUE))], by = 'year')
      citycalc$rel = citycalc$stat / citycalc$natstat
      for(droptop in c(1,5,10)){
        citycalc[,paste0('rel_dr',droptop)] = citycalc[,paste0('stat_dr',droptop)] / citycalc[,paste0('natstat_dr',droptop)]
      }

      #Now compute variation statistics dropping various amounts
      #(population-weighted; suffix 1/5/10 = share of top percentiles dropped)
      citystats = citycalc %>% group_by(year) %>%
        summarise(
          sd = sqrt(wtd.var(rel,weights = pop)),
          sd1 = sqrt(wtd.var(rel_dr1,weights = pop_dr1)),
          sd5 = sqrt(wtd.var(rel_dr5,weights = pop_dr5)),
          sd10 = sqrt(wtd.var(rel_dr10,weights = pop_dr10)),
          p10d = wtd.quantile(rel,weights = pop, probs = .1),
          p10d1 = wtd.quantile(rel_dr1,weights = pop_dr1, probs = .1),
          p10d5 = wtd.quantile(rel_dr5,weights = pop_dr5, probs = .1),
          p10d10 = wtd.quantile(rel_dr10,weights = pop_dr10, probs = .1),
          p25d = wtd.quantile(rel,weights = pop, probs = .25),
          p25d1 = wtd.quantile(rel_dr1,weights = pop_dr1, probs = .25),
          p25d5 = wtd.quantile(rel_dr5,weights = pop_dr5, probs = .25),
          p25d10 = wtd.quantile(rel_dr10,weights = pop_dr10, probs = .25),
          p75d = wtd.quantile(rel,weights = pop, probs = .75),
          p75d1 = wtd.quantile(rel_dr1,weights = pop_dr1, probs = .75),
          p75d5 = wtd.quantile(rel_dr5,weights = pop_dr5, probs = .75),
          p75d10 = wtd.quantile(rel_dr10,weights = pop_dr10, probs = .75),
          p90d = wtd.quantile(rel,weights = pop, probs = .9),
          p90d1 = wtd.quantile(rel_dr1,weights = pop_dr1, probs = .9),
          p90d5 = wtd.quantile(rel_dr5,weights = pop_dr5, probs = .9),
          p90d10 = wtd.quantile(rel_dr10,weights = pop_dr10, probs = .9)
        ) %>% data.frame()

      #Compute nonparametric measures from percentiles calculated above
      #('' = full sample)
      for(droptop in c('',1,5,10)){
        citystats[,paste0('iqrd',droptop)] = citystats[,paste0('p75d',droptop)] - citystats[,paste0('p25d',droptop)]
        citystats[,paste0('p19d',droptop)] = citystats[,paste0('p90d',droptop)] - citystats[,paste0('p10d',droptop)]
      }

      #Plot various ratios over time
      varis = c('sd','iqrd','p19d')
      varinames = c('Coefficient of variation','Inter-quartile range','90-10 range')

      #Keep the dispersion series for the CSV written after the sample loop
      sigstats = citystats[,c('year',
          grep('sd',names(citystats),value = TRUE),
          grep('iqr',names(citystats),value = TRUE),
          grep('p19',names(citystats),value = TRUE))]
      sigstats$samp = sname
      sigstats$stat = st
      dtsig = rbind(dtsig, sigstats)

      for(vi in seq_along(varis)){
        vari = varis[vi]
        variname = varinames[vi]
        pdta = citystats[,c('year',grep(vari,names(citystats),value = TRUE))]
        pdta = melt(pdta, id.vars = 'year')
        pdta$variable = factor(pdta$variable,levels = paste0(vari,c('',1,5,10)),labels = c('Full sample',paste0('Dropping top ',c(1,5,10),'%')))

        #Do raw
        pdf(paste0('output/ipums/',fname,'/droptop/',sname,'/dt_sigma_',sname,'_',st,'_',vari,'.pdf'),height = 6)
        gplt = ggplot(pdta, aes(x = year, y = value, lty = variable)) +
          geom_line() +
          geom_point() +
          theme_bw() +
          scale_linetype_discrete(name = '') +
          scale_x_continuous(name = 'Year') +
          scale_y_continuous(name = variname,limits = c(0,1.1* max(pdta$value))) +
          theme(legend.position = 'bottom') # +
          # ggtitle(paste('Cross-',cityname,' ',variname,' of ',stname,' ',tname,'\nincome over time, dropping top income groups',sep = ''))
        print(gplt)
        dev.off()

        #Do growth relative to self in 1980
        pdta80 = filter(pdta,year == 1980)
        ndta = merge(pdta,pdta80,by = c('variable'),suffixes = c('','_80'))
        ndta$growth = ndta$value / ndta$value_80

        #Make plot (sname appears twice in the file name; left unchanged so
        #existing output paths stay the same)
        pdf(paste0('output/ipums/',fname,'/droptop/',sname,'/dt_sigmagr_',sname,'_',st,'_',vari,'_',sname,'.pdf'),height = 6)
        gplt = ggplot(ndta, aes(x = year, y = growth, lty = variable, group = variable)) +
          geom_line() +
          geom_point() +
          theme_bw() +
          scale_linetype_discrete(name = '') +
          scale_x_continuous(name = 'Year') +
          scale_y_continuous(name = paste(variname,'\nrelative to 1980 value'),labels = percent,limits = c(.9*min(ndta$growth),1.1* max(ndta$growth))) +
          theme(legend.position = 'bottom') # +
          # ggtitle(paste('Growth in cross-',cityname,' ',variname,' of\n',stname,' ',tname,' income over time, dropping top income groups',sep = ''))
        print(gplt)
        dev.off()
      }

      #Now do beta convergence -- annual growth rate 1980-2013 vs level in 1980
      #One row per city: 2013 values plus the 1980 values under suffix '_1980'
      city1980 = filter(citycalc, year == 1980)
      citygr = merge(citycalc, city1980, by = 'city', suffixes = c('','_1980'))
      citygr = filter(citygr, year == 2013)

      dts = c('','_dr1','_dr5','_dr10')
      dtnames = c('',', dropping top 1%',', dropping top 5%',', dropping top 10%')
      #Row for the slope table; starting from character values means the
      #slopes appended below are stored as character strings too
      betas = c(sname,st)

      for(dt in seq_along(dts)){
        droptop = dts[dt]
        dtname = dtnames[dt]
        citygr$curstat = citygr[,paste0('stat',droptop)]
        citygr$stat1980 = citygr[,paste0('stat',droptop,'_1980')]
        citygr$curpop = citygr[,paste0('pop',droptop)]

        citygr$anngr = (citygr$curstat / citygr$stat1980) ^ (1/(2013-1980)) - 1

        #Fit linear model
        #Note I drop if they had literally zero median income in 1980 - there are a few CZs where this is true for women
        #The check uses grepl (always a single TRUE/FALSE). The earlier
        #grep(...) == 1 form was zero-length for non-female samples, which
        #made the whole condition zero-length and the assertion a no-op.
        stopifnot(grepl('female',sname) || all(citygr$stat1980 > 0))
        mod1 = lm(formula = anngr ~ stat1980,weights = curpop,data = filter(citygr,stat1980 >0))
        betas = c(betas,mod1$coefficients[2])

        #Plot annual growth rate vs initial level (dashed line = fitted model)
        pdf(paste0('output/ipums/',fname,'/droptop/',sname,'/dt_beta_',sname,'_',st,droptop,'.pdf'),width = 6)
        gplt = ggplot(citygr, aes(x = stat1980, y = anngr,size = curpop)) +
          geom_point(alpha = .3) +
          theme_bw() +
          scale_x_continuous(labels = dollar, name = paste('1980',cityname,stname,tname,'income'))+
          scale_y_continuous(labels = percent, name = paste('Annualized growth rate in',stname,tname,'income, 1980-2013')) +
          geom_abline(intercept = mod1$coefficients[1], slope = mod1$coefficients[2],lty = 2) +
          scale_size_continuous(range = c(0,5), labels = comma, name = paste(cityname, '\npopulation, 2013')) +
          theme(legend.position = 'bottom')
          # ggtitle(paste('Beta convergence of ',cityname,' ',stname,' ',tname,' income\n1980-2013',dtname,sep = ''))

        print(gplt)
        dev.off()
      }
      dtbeta = rbind(dtbeta, betas)

    } #stat
  } #samp

  #Slopes: one row per (sample, statistic); d0 = full sample, dN = top N% dropped
  dtbeta = data.frame(dtbeta)
  names(dtbeta) = c('samp','stat','d0','d1','d5','d10')
  write.csv(dtbeta,paste0('output/ipums/',fname,'/droptop/dt_beta_',fname,'.csv'),row.names = F)

  #Dispersion series: one row per (sample, statistic, year)
  dtsig = data.frame(dtsig)
  write.csv(dtsig,paste0('output/ipums/',fname,'/droptop/dt_sigma_',fname,'.csv'),row.names = F)

} #var
