# Take raw IPUMS data and convert to yearly CZ/MSA datasets

library(dplyr)
library(readstata13)
library(stringr)

setwd('~/projects/regionineq/')

#Bring in main IPUMS 
bigd = read.dta13('data/ipums/usa_00029.dta')

#Replace 999999s with missing
bigd[bigd$inctot == 9999999,'inctot'] = NA
bigd[bigd$ftotinc == 9999999,'ftotinc'] = NA
bigd[bigd$hhincome == 9999999,'hhincome'] = NA

#In 2006 to 2011 ACS, Louisiana PUMAs 01801, 01802, and 01905 are all coded as 77777
#As described at https://usa.ipums.org/usa-action/variables/PUMA#codes_section
#01801 and 01802 map to Orleans parish, while 01905 is split across Jefferson, Plaquemines, and St. Bernard Parish
#All of these PUMAs map to the New Orleans MSA and the New Orleans City CZ
#To handle this, I am replacing "77777" with PUMA "1801" which is New Orleans
stopifnot(bigd[bigd$puma == '77777' & !is.na(bigd$puma),'year'] %in% c(2010,2015))
stopifnot(bigd[bigd$puma == '77777' & !is.na(bigd$puma),'statefip'] == '22')
bigd[bigd$puma == '77777' & !is.na(bigd$puma),'puma'] = '1801'

#Make datasets for each year for MSA and CZ
#1980
d80 = filter(bigd, year == 1980)
d80$cntygrp = paste(str_pad((d80$statefip),width = 2, pad = '0'),str_pad(as.character(d80$cntygp98),width = 3,pad = '0'),sep = '')

#MSAs
msa80 = read.csv('data/mable/puma8_to_msa.csv')
msa80$cntygrp = str_pad(msa80$cntygrp,width = 5,pad = '0')
msa80 %>% group_by(cntygrp) %>% summarise(v = sum(afactor)) %>% summary() #all sum to 1. Good

#Merge and create new weights based on allocation factor - percentage of PUMA's residents in each county, as in Dorn 2009
dta8m = merge(msa80, d80,by = 'cntygrp', all.x = T)
dta8m$hcwt = dta8m$hhwt * dta8m$afactor
dta8m$pcwt = dta8m$perwt * dta8m$afactor
stopifnot(abs(sum(dta8m$pcwt) - sum(d80$perwt)) <.5)
dta8m = dta8m[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','cntygrp','msa','msaname')]
save(dta8m ,file = 'data/ipums/ip80msa.RData')

#CZs
cz80 = read.csv('data/mable/puma8_to_cz.csv')
cz80$cntygrp = str_pad(cz80$cntygrp,width = 5,pad = '0')

cz80 %>% group_by(cntygrp) %>% summarise(v = sum(afactor)) %>% summary() #all sum to 1. Good

dta8c = merge(cz80, d80,by = 'cntygrp', all.x = T)
dta8c$hcwt = dta8c$hhwt * dta8c$afactor
dta8c$pcwt = dta8c$perwt * dta8c$afactor
stopifnot(abs(sum(dta8c$pcwt) - sum(d80$perwt)) <.5)
dta8c = dta8c[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','cntygrp','czone','czname')]
save(dta8c ,file = 'data/ipums/ip80cz.RData')

#1990
d90 = filter(bigd, year == 1990)
d90$puma9 = paste(str_pad((d90$statefip),width = 2, pad = '0'),str_pad(as.character(d90$puma),width = 5,pad = '0'),sep = '')

#MSA
msa90 = read.csv('data/mable/puma9_to_msa.csv')
msa90$puma9 = str_pad(msa90$puma9,width = 7,pad = '0')

msa90 %>% group_by(puma9) %>% summarise(v = sum(afactor)) %>% summary() #all sum to 1. Good

dta9m = merge(msa90, d90, by = 'puma9', all.x =T)
dta9m$hcwt = dta9m$hhwt * dta9m$afactor
dta9m$pcwt = dta9m$perwt * dta9m$afactor
stopifnot(abs(sum(dta9m$pcwt) - sum(d90$perwt)) <.00005 * sum(dta9m$perwt)) #Off by 27, which is a little weird
dta9m = dta9m[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','puma9','msa','msaname')]
save(dta9m ,file = 'data/ipums/ip90msa.RData')

#CZ
cz90 = read.csv('data/mable/puma9_to_cz.csv')
cz90$puma9 = str_pad(cz90$puma9,width = 7,pad = '0')

cz90test = cz90 %>% group_by(puma9) %>% summarise(v = sum(afactor)) %>% data.frame()
summary(cz90test) #Almost all sum to 1. Not quite
filter(cz90test, v > 1) #There is one that is 1.014, and several that are 1.000001. 
filter(cz90, puma9 == '0200300') #The 1.014 one is in Alaska where we had to do some manual adjustment, including Kotzebue as noted in prep_puma_lookups

dta9c = merge(cz90, d90, by = 'puma9', all.x =T)
dta9c$hcwt = dta9c$hhwt * dta9c$afactor
dta9c$pcwt = dta9c$perwt * dta9c$afactor
stopifnot(abs(sum(dta9c$pcwt) - sum(d90$perwt)) <.00005 * sum(dta9c$perwt)) #Off by 27, which is a little weird
dta9c = dta9c[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','puma9','czone','czname')]
save(dta9c ,file = 'data/ipums/ip90cz.RData')

#2000
d00 = filter(bigd, year == 2000)
d00$puma0 = paste(str_pad((d00$statefip),width = 2, pad = '0'),str_pad(as.character(d00$puma),width = 4,pad = '0'),sep = '')

#MSA
msa00 = read.csv('data/mable/puma0_to_msa.csv')
msa00$puma0 = str_pad(msa00$puma0,width = 6,pad = '0')

msa00 %>% group_by(puma0) %>% summarise(v = sum(afactor)) %>% summary() #All sum to 1

dta0m = merge(msa00, d00, by = 'puma0', all.x =T)
dta0m$hcwt = dta0m$hhwt * dta0m$afactor
dta0m$pcwt = dta0m$perwt * dta0m$afactor
stopifnot(abs(sum(dta0m$pcwt) - sum(d00$perwt)) <.00005 * sum(dta0m$perwt)) #Not perfect which is annoying but off by only a tiny bit. Because they dont' sum to exactly 1 above probably
dta0m = dta0m[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','puma0','msa','msaname')]
save(dta0m ,file = 'data/ipums/ip00msa.RData')

#CZ
cz00 = read.csv('data/mable/puma0_to_cz.csv')
cz00$puma0 = str_pad(cz00$puma0,width = 6,pad = '0')

cz00 %>% group_by(puma0) %>% summarise(v = sum(afactor)) %>% summary() #not all sum to exactly 1. Hmmmm.
cz00test = cz00 %>% group_by(puma0) %>% summarise(v = sum(afactor)) %>% data.frame()
summary(cz00test) #Almost all sum to 1. Not quite
filter(cz00test, v > 1) #There is one that is 1.016, and several that are 1.000001. 
filter(cz00, puma0 == '020400') #The 1.016 one is in Alaska and again includes Kotzebue

dta0c = merge(cz00, d00, by = 'puma0', all.x =T)
dta0c$hcwt = dta0c$hhwt * dta0c$afactor
dta0c$pcwt = dta0c$perwt * dta0c$afactor
stopifnot(abs(sum(dta0c$pcwt) - sum(d00$perwt)) <.00005 * sum(dta0c$perwt)) #Not perfect which is annoying but off by only a tiny bit. Because they dont' sum to exactly 1 above probably
dta0c = dta0c[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','puma0','czone','czname')]
save(dta0c ,file = 'data/ipums/ip00cz.RData')


#2006-2010 - still uses 2000 PUMAs because sample years are 06-10
d10 = filter(bigd, year == 2010)

#Label as 2008 because that's the middle year of the sample period
d10$year == 2008

#Make PUMA
d10$puma0 = paste(str_pad((d10$statefip),width = 2, pad = '0'),str_pad(as.character(d10$puma),width = 4,pad = '0'),sep = '')

#MSA
msa00 = read.csv('data/mable/puma0_to_msa.csv')
msa00$puma0 = str_pad(msa00$puma0,width = 6,pad = '0') #Still using 2000 PUMAs because it's ACS

dta1m = merge(msa00, d10,by = 'puma0', all.x = T)
dta1m$hcwt = dta1m$hhwt * dta1m$afactor
dta1m$pcwt = dta1m$perwt * dta1m$afactor
#Only missing data are the two LA PUMAs who I didn't assign people to, 1802 and 1905
filter(dta1m, is.na(pcwt))
stopifnot(sum(is.na(dta1m$pcwt)) == 2)
#Drop those two rows
dta1m = filter(dta1m, !is.na(pcwt)) 
stopifnot(abs(sum(dta1m$pcwt) - sum(d10$perwt)) <.00005 * sum(dta1m$perwt)) 

dta1m = dta1m[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','puma0','msa','msaname')]
save(dta1m ,file = 'data/ipums/ip08msa.RData')

#CZ
cz00 = read.csv('data/mable/puma0_to_cz.csv')
cz00$puma0 = str_pad(cz00$puma0,width = 6,pad = '0') #Still using 2000 PUMAs because it's ACS

dta1c = merge(cz00, d10,by = 'puma0', all.x = T)
dta1c$hcwt = dta1c$hhwt * dta1c$afactor
dta1c$pcwt = dta1c$perwt * dta1c$afactor
#Only missing data are the two LA PUMAs who I didn't assign people to, 1802 and 1905
filter(dta1c, is.na(pcwt))
stopifnot(sum(is.na(dta1c$pcwt)) == 2)
#Drop those two rows
dta1c = filter(dta1c, !is.na(pcwt)) 
stopifnot(abs(sum(dta1c$pcwt) - sum(d10$perwt)) <.00005 * sum(dta1c$perwt)) #Not great, and also had to delete

dta1c = dta1c[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','puma0','czone','czname')]
save(dta1c ,file = 'data/ipums/ip08cz.RData')


#2011-2015 - Have to combine 2000 and 2010 PUMAs
dt15 = filter(bigd, year == 2015)

#Label as 2013 because that's the midpoint
dt15$year = 2013

dt15$pumayear = 2000
dt15[dt15$multyear %in% c(2012,2013,2014,2015), 'pumayear'] = 2010 #Switch to 2010 pumas in 2012 from https://usa.ipums.org/usa-action/variables/PUMA#description_section
dt15[dt15$pumayear == 2010,'puma1'] = paste(str_pad((dt15[dt15$pumayear == 2010,'statefip']),width = 2, pad = '0'),str_pad(as.character(dt15[dt15$pumayear == 2010,'puma']),width = 5,pad = '0'),sep = '')
dt15[dt15$pumayear == 2000,'puma1'] = paste(str_pad((dt15[dt15$pumayear == 2000,'statefip']),width = 2, pad = '0'),str_pad(as.character(dt15[dt15$pumayear == 2000,'puma']),width = 4,pad = '0'),sep = '')

#MSA
msa10 = read.csv('data/mable/puma1_to_msa.csv')
msa10$pumayear = 2010
msa10$puma1 = str_pad(msa10$puma1, width = 7, pad = '0')

msa10 %>% group_by(puma1) %>% summarise(v = sum(afactor)) %>% summary() #All sum to 1

msa00 = read.csv('data/mable/puma0_to_msa.csv')
msa00$puma1 = str_pad(msa00$puma0,width = 6,pad = '0') #Still using 2000 PUMAs because it's ACS
msa00$pumayear = 2000

msa15 = rbind(msa10[,c('msa','msaname','pumayear','puma1','afactor')],msa00[,c('msa','msaname','pumayear','puma1','afactor')])

dta15m = merge(msa15, dt15,by = c('puma1','pumayear'), all = T)
dta15m$hcwt = dta15m$hhwt * dta15m$afactor
dta15m$pcwt = dta15m$perwt * dta15m$afactor

#Only missing data are the two LA PUMAs who I didn't assign people to, 1802 and 1905
filter(dta15m, is.na(pcwt))
stopifnot(sum(is.na(dta15m$pcwt)) == 2)
#Drop those two rows
dta15m = filter(dta15m, !is.na(pcwt)) 
stopifnot(abs(sum(dta15m$pcwt) - sum(dt15$perwt)) <.00005 * sum(dt15$perwt)) 
dta15m = dta15m[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','puma1','msa','msaname')]
save(dta15m ,file = 'data/ipums/ip13msa.RData')

#CZ
cz10 = read.csv('data/mable/puma1_to_cz.csv')
cz10$pumayear = 2010
cz10$puma1 = str_pad(cz10$puma1, width = 7, pad = '0')

cz10 %>% group_by(puma1) %>% summarise(v = sum(afactor)) %>% summary() #not all sum to exactly 1.
cz10test = cz10 %>% group_by(puma1) %>% summarise(v = sum(afactor)) %>% data.frame()
summary(cz10test) #Not quite all
filter(cz10test, v > 1) #There is one that is 1.014864, and several that are 1.000001. 
filter(cz10, puma1 == '0200400') #The 1.014864 one is in Alaska and again includes Kotzebue

cz00 = read.csv('data/mable/puma0_to_cz.csv')
cz00$puma1 = str_pad(cz00$puma0,width = 6,pad = '0') #Still using 2000 PUMAs because it's ACS
cz00$pumayear = 2000

cz15 = rbind(cz10[,c('czone','czname','pumayear','puma1','afactor')],cz00[,c('czone','czname','pumayear','puma1','afactor')])

dta15c = merge(cz15, dt15,by = c('puma1','pumayear'), all = T)
dta15c$hcwt = dta15c$hhwt * dta15c$afactor
dta15c$pcwt = dta15c$perwt * dta15c$afactor

#Only missing data are the two Louisiana PUMAs who I didn't assign people to, 1802 and 1905
filter(dta15c, is.na(pcwt))
stopifnot(sum(is.na(dta15c$pcwt)) == 2)
#Drop those two rows
dta15c = filter(dta15c, !is.na(pcwt)) 
stopifnot(abs(sum(dta15c$pcwt) - sum(dt15$perwt)) <.00005 * sum(dt15$perwt)) 

#Save
dta15c = dta15c[,c('year','serial','hhwt','statefip','gq','hhincome','pernum','perwt','famsize','famunit','relate','related','sex','age','race','raced','hispan','hispand','bpl','bpld','educ','educd','occ1990','ind1990','inctot','ftotinc','empstatd','labforce','empstat','pcwt','hcwt','puma1','czone','czname')]
save(dta15c ,file = 'data/ipums/ip13cz.RData')

