#Calculate the income and percentage of each city at each national percentile
#Figure out how to deal with zeroes: split evenly among bottom X percentiles
#For each city:
  #Calculate the percentage of population in each percentile
  #Allocate zeroes correctly
  #Calculate mean income within each national percentile
  #Calculate city median income in dollars and in terms of national percentiles
#Output:
  #National list of percentile cutoffs, total population in each percentile, mean and median income in each percentile
  #Metro list of population in each percentile, mean and median income in each percentile, 
  #Metro list of mean and median income for each metro


library(tidyverse)
library(Hmisc)
library(stringr)
# library(scales)
library(readstata13)
# library(ggplot2)

setwd('~/projects/regionineq/')

#Bring in CPI
cpirs = read.dta13('data/reference/cpirs.dta')
cpi15 = cpirs[cpirs$year == 2015,'rate']

#Going to loop over MSAs vs CZs
varnames = c('msa','czone')
filenames = c('msa','cz')
cleannames = c('msaname','czname')
citynames = c('MSA','Commuting Zone')

#Make ipums output dir
dir.create('output/ipums')

for(i in   2){ #1:length(varnames)){  #
  vname = varnames[i]
  fname = filenames[i]
  cname = cleannames[i]
  cityname = citynames[i]
  
  dir.create(paste('output/ipums/',fname,sep = ''))
  dir.create(paste('output/ipums/',fname,'/dta',sep = ''))
  

  #Loop through years
  for(yr in c('80','90','00','08','13')){ #
    fullyr = as.numeric(yr) + 2000
    if(yr %in% c('80','90')) fullyr = fullyr - 100
    print(paste(yr, vname))
    cpiyr = cpirs %>% filter(year == fullyr - 1) %>% select(rate) %>% unlist()
    #Note 2006-2010 uses 2010 dollars, 2011-2015 uses 2013 dollars. I think - double check
    if(yr == '08') cpirs %>% filter(year == 2010 - 1) %>% select(rate) %>% unlist() 
    if(yr == '13') cpirs %>% filter(year == 2015 - 1) %>% select(rate) %>% unlist()

    #Make dataframe to store national information from each sample
    natinfo =NULL 
    
    #Bring in data
    dtfile = load(paste('data/ipums/ip',yr,fname,'.RData',sep = ''))
    dta = get(dtfile)
    
    #Make city identfier
    dta$city = dta[,vname]
    dta$cityname = as.character(dta[,cname])
    
    #Inflate 
    dta$hhincome = dta$hhincome * cpi15 / cpiyr
    dta$inctot = dta$inctot * cpi15 / cpiyr
    dta$ftotinc = dta$ftotinc * cpi15 / cpiyr
    
    #Make normalized hh and family incomes
    dta = dta %>% group_by(year, serial,city) %>% mutate(hhsize = length(serial)) %>% data.frame()
    dta$hhincsqrt = dta$hhincome / sqrt(dta$hhsize)
    dta$hhincnorm = dta$hhincome / dta$hhsize
    
    dta = dta %>% group_by(year, serial,famunit,city) %>% mutate(fsize = length(serial),finc = sum(inctot,na.rm = T)) %>% data.frame()
    dta$fincsqrt = dta$finc / sqrt(dta$fsize)
    dta$fincnorm = dta$finc / dta$fsize
    
    #All samples are person-weighted
    #Samples are household, family, normalized hh, normalize family, adult individual, adult individual male, adult individual female income

    #Samples
    fam = dta
    hh = dta
    famsqrt = dta
    famnorm = dta
    hhsqrt = dta
    hhnorm = dta
    adult = dta %>% filter(age >= 18)
    adultmale = dta %>% filter(age >= 18, sex == 'Male')
    adultfemale = dta %>% filter(age >= 18, sex == 'Female')  
    
    #7% of 2015 prime males had no income
    #1% of 2015 households had no income
    
    #Maybe I don't separate out  bug weight by number of people in that group in addition
    #So segregation of people with zero income is bigger
    #Or this may come out by repleating it a bunch of times, once per percentile
    
    #But if city percent zero isn't in line with nation, do you assign it to 0th or 5th percentile of nation?
    #Maybe spread evenly?
    #And then I guess don't put it into teh box that has some zeros and some not.
    
    #And then in simulations, assign the first X% of zerose to Xth percentile, adn then move on. 
    #And re-calcualte Jth percentile mean to be weighted average of that and zero.
    
    #For each percentile, get national population and national mean in that percentile, 
    #Then for each city get percentage and mean in that percentile
    
    #Output:
    #year x percentile x city file
    #Each row have 
        #percent of city in that percentile in that year
        #mean of that city in that percentile year
    
    
    #Loop through samples 
    samps = list(fam,famsqrt,famnorm,hh,hhsqrt,hhnorm,adult,adultmale,adultfemale)
    snames = c('fam','famsqrt','famnorm','hh','hhsqrt','hhnorm','adult','adultmale','adultfemale')
    # tnames = c('family','sqrt-normalized family','household','family','adult individual','adult male','adult female')
    
    for(s in 1:length(samps)){
      
      samp = samps[[s]]
      sname = snames[s]
      # titname = tnames[s]
      print(sname)
      dir.create(paste('output/ipums/',fname,'/dta/',sname,sep = ''))
      
      #Get weights - weighting by person in all cases
      samp$wt = samp$pcwt

      #Get income
      if(sname %in% c('fam')) samp$inc = samp$ftotinc
      if(sname %in% c('famsqrt')) samp$inc = samp$fincsqrt
      if(sname %in% c('famnorm')) samp$inc = samp$fincnorm
      if(sname %in% c('hh')) samp$inc = samp$hhincome
      if(sname %in% c('hhsqrt')) samp$inc = samp$hhincsqrt
      if(sname %in% c('hhnorm')) samp$inc = samp$hhincnorm
      if(sname %in% c('adult','adultmale','adultfemale')) samp$inc = samp$inctot
      
      #Drop missing incomes
      samp = filter(samp, !is.na(inc))
      
      #Treat negative incomes as zeros
      samp[samp$inc <0, 'inc'] = 0
      # samp = filter(samp, inc >= 0)
      
      #Get sample of overall distribution for speed
      # natincsamp = base::sample(samp$inc, prob = samp$wt, size = 100000, replace = T)
      
      #Get national percentiles
      natpcts = wtd.quantile(samp$inc, weights = samp$wt, probs = 1:100 / 100)
      
      # npctraw = cbind(1:100,natpcts) %>% data.frame()
      # names(npctraw) = c('pct','uplim')
      # 
      # #Now identify cases where one value crosses multiple percentiles and write out the ranges
      # npcts = npctraw %>% group_by(uplim) %>% summarise(pmin = min(pct),pmax = max(pct)) %>% data.frame()
      # npcts$pgroup = 1:nrow(npcts)
      # 
      # #Assign people to national percentiles
      # samp$pgroup = cut(samp$inc,breaks = c(-1,npcts$uplim),labels = npcts$pgroup)
      # samp = merge(samp,npcts, by = 'pgroup')
      # 
      # #
      # natstats = samp %>% group_by(pmin,pmax) %>% summarise(cnt = sum(wt), mean = wtd.mean(inc, weights = wt), median = wtd.quantile(inc, weights = wt,probs = .5)) %>% data.frame()
      # natstats$pcts = natstats$toppct - natstats$botpct
      
      #Identify and account for cases where more than 1% of population has same income (e.g. $0)
      #Get names for each set of percentiles that have the same income
      pctnames = NULL
      pstart = 'p0to'
      natdupes = NULL
      for(pct in 1:99){
        if(abs(natpcts[pct] - natpcts[pct+1]) < .000001){
          natdupes = c(natdupes,pct)
          next()
        }
        
        pname = paste(pstart,pct,sep = '')
        pctnames = c(pctnames, pname)
        pstart = paste('p',pct,'to',sep = '')
        
      }
      pname = paste(pstart,100,sep = '')
      pctnames = c(pctnames, pname)
      
      #Get deduplicated list for slicing
      pctcuts = c(-1,natpcts+.1)
      if(length(natdupes) > 0) pctcuts = c(-1,natpcts[-natdupes]+.1)
  
      #Assign people to national percentiles
      samp$pctile = cut(samp$inc,breaks = pctcuts,labels = pctnames)
      
      #Get count and mean and median by percentiles
      natstats = samp %>% group_by(pctile) %>% summarise(cnt = sum(wt), mean = wtd.mean(inc, weights = wt), median = wtd.quantile(inc, weights = wt,probs = .5)) %>% data.frame()
      
      #get top and bottom percentiles
      natstats$botpct = str_extract(natstats$pctile,'p[0-9][0-9]?') %>% str_replace('p','') %>% as.numeric()
      natstats$toppct = str_extract(natstats$pctile,'to[0-9][0-9]?[0-9]?') %>% str_replace('to','') %>% as.numeric()
      natstats$pcts = natstats$toppct - natstats$botpct
      
      #Now make 100 row dataset with the count, mean, and median for each percentile, dividing people in multi-percentile buckets evenly
      nat100stats = NULL
      for(p in 1:100) {
        rowint = filter(natstats, toppct >= p, botpct < p)
        upperlim = natpcts[p]
        stopifnot(nrow(rowint) < 2) #Always either 0 or 1 row meets criteria
        if(nrow(rowint)==0) {
          pcnt = 0
          pmean = 0
          pmed = 0
          ppct = 0
        }
        if(nrow(rowint) == 1){
          pmean = rowint$mean
          pmed = rowint$median
          pcnt = rowint$cnt / rowint$pcts
          ppct = pcnt / sum(natstats$cnt)
        }
        nat100stats = rbind(nat100stats, c(p,pcnt,ppct,pmean,pmed,upperlim,fullyr))
      }
      
      #Export national percentile dataset
      nat100stats = data.frame(nat100stats)
      names(nat100stats) = c('p','cnt','pct','mean','med','upperlim','year')
      write.csv(nat100stats,paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_nat_',sname,'_',fullyr,'.csv',sep = ''),row.names = F)
        
      #Now make a 100 * cities row dataset with same stats for each city
      citystats = samp %>% group_by(city,cityname,pctile) %>% summarise(cnt = sum(wt), mean = wtd.mean(inc, weights = wt), median = wtd.quantile(inc, weights = wt,probs = .5)) %>% data.frame()
      
      #get top and bottom percentiles
      citystats$botpct = str_extract(citystats$pctile,'p[0-9][0-9]?') %>% str_replace('p','') %>% as.numeric()
      citystats$toppct = str_extract(citystats$pctile,'to[0-9][0-9]?[0-9]?') %>% str_replace('to','') %>% as.numeric()
      citystats$pcts = citystats$toppct - citystats$botpct
      
      #Loop through each city and percentile
      city100stats = NULL
      for(ci in unique(citystats$city)) {
        cstats = NULL
        cpop = filter(citystats, city == ci) %>% select(cnt) %>% sum()
        print(ci)
        for(p in 1:100){
          # baset = Sys.time()
          rowint = filter(citystats, toppct >= p, botpct < p, city == ci)
          # print(Sys.time() - baset)
          numrows = dim(rowint)[1]
          # print(Sys.time() - baset)
          # stopifnot(numrows < 2) #The nrows take quite a bit of time, so stopping
          # print(Sys.time() - baset)
          if(numrows==0) {
            ciname = ''
            pcnt = 0
            pmean = NA
            pmed = NA
            ppct = 0
          } else {
          # print(Sys.time() - baset)
          # if(numrows == 1){
            # print(Sys.time() - baset)
            ciname = rowint$cityname
            pmean = rowint$mean
            pmed = rowint$median
            pcnt = rowint$cnt / rowint$pcts
            # print(Sys.time() - baset)
            ppct = pcnt / cpop
            # print(Sys.time() - baset)
          }
          # print(Sys.time() - baset)
          cstats = rbind(cstats, c(ci,ciname,p,pcnt,ppct,pmean,pmed,fullyr))
          # print(Sys.time() - baset)
          
        }
        city100stats = rbind(city100stats,cstats)
      }
      
      city100stats = data.frame(city100stats)
      names(city100stats) = c(fname,cname,'p','cnt','pct','mean','med','year')
      write.csv(city100stats,paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_pct_city_',sname,'_',fullyr,'.csv',sep = ''),row.names = F)
      
      #Finally, compute mean and median of each city and nation
      
      #National mean, median, population
      natmeans = samp %>% summarise(cnt = sum(wt), nobs = length(wt), mean = wtd.mean(inc, weights = wt), median = wtd.quantile(inc, weights = wt,probs = .5)) %>% data.frame()
      natmeans$yr = yr
      natmeans$fullyr = fullyr
      natmeans$samp = sname
      natinfo = rbind(natinfo, natmeans)
      
      #City mean, median, population
      citymeans = samp %>% group_by(city,cityname) %>% summarise(cnt = sum(wt), nobs = length(wt), mean = wtd.mean(inc, weights = wt), median = wtd.quantile(inc, weights = wt,probs = .5)) %>% data.frame()
      
      write.csv(citymeans,paste('output/ipums/',fname,'/dta/',sname,'/',fname,'_citymeans_',sname,'_',yr,'.csv',sep = ''),row.names = F)

    } #sample
    
    write.csv(natinfo,paste('output/ipums/',fname,'/dta/',fname,'_natinfo_',yr,'.csv',sep = ''),row.names = F)
    
  }#year
  natinfo80 = read.csv(paste('output/ipums/',fname,'/dta/',fname,'_natinfo_80.csv',sep = ''))
  natinfo90 = read.csv(paste('output/ipums/',fname,'/dta/',fname,'_natinfo_90.csv',sep = ''))
  natinfo00 = read.csv(paste('output/ipums/',fname,'/dta/',fname,'_natinfo_00.csv',sep = ''))
  natinfo08 = read.csv(paste('output/ipums/',fname,'/dta/',fname,'_natinfo_08.csv',sep = ''))
  natinfo13 = read.csv(paste('output/ipums/',fname,'/dta/',fname,'_natinfo_13.csv',sep = ''))
  natinfo = rbind(natinfo80,natinfo90,natinfo00,natinfo08,natinfo13)
  
  write.csv(natinfo,paste('output/ipums/',fname,'/dta/',fname,'_natinfo.csv',sep = ''),row.names = F)
  
}#variable
    