#Make crosswalks from PUMAs to 2013 MSAs and 1990 CZs
#Non-metro counties are assigned to one residual group per state

library(stringr)
library(dplyr)
library(foreign)

setwd('~/projects/regionineq/')

#Bring in state names for non-metropolitan counties
states = read.csv('data/reference/stfips.csv', colClasses =  'character')
states$st = str_pad(states$FIPS.Code, width = 2, pad = '0')
states = states[,c(3,2)]
names(states) = c('st','stname')

#Bring in county to MSA lookup, 2013 version
msas = read.csv('data/reference/CountyMSA_2013.csv', colClasses = 'character')
msas$st = str_pad(msas$FIPS.State.Code,width = 2, pad = '0')
msas$cnty = str_pad(msas$FIPS.County.Code,width = 3, pad = '0')
msas$fips = paste(msas$st, msas$cnty,sep = '')
msas = msas[,c('fips','CBSA.Code','CBSA.Title')]
names(msas) = c('fips','msa','msaname')

#Manually add counties added since 1990
msas[nrow(msas)+1,] = c('15901','27980','Kahului-Wailuku-Lahaina, HI') #Maui and Kalawao County HI -> Kahului
msas[nrow(msas)+1,] = c('46102','46','South Dakota non-Metro') #Ogalala Lakota County -> Gordon SD
msas[nrow(msas)+1,] = c('55901','43020','Shawano, WI') #Shawano + Menominee County -> Shawano WI
msas[nrow(msas)+1,] = c('02201','2','Alaska non-Metro') #Three Alaska counties that aren't current to the MSAs list are all non-metro according to https://en.wikipedia.org/wiki/Alaska_statistical_areas
msas[nrow(msas)+1,] = c('02231','2','Alaska non-Metro') 
msas[nrow(msas)+1,] = c('02280','2','Alaska non-Metro') 

#Bring in counties to CZs lookup
czs = read.csv('data/reference/czlma903.csv')
czs$fips = str_pad(czs$County.FIPS.Code, 5,pad = '0')
czs$czone = str_pad(czs$CZ90,5,pad = '0')
czs = select(czs, fips, czone)

#Manually add CZs for counties added since 1990
czs[nrow(czs)+1,] = c('08014','28900') #Broomfield County CO -> Denver CZ
czs[nrow(czs)+1,] = c('12086','07000') #Miami-Dade County FL -> Miami CZ
czs[nrow(czs)+1,] = c('15901','34703') #Maui and Kalawao County HI -> Kahului
czs[nrow(czs)+1,] = c('46102','27704') #Ogalala Lakota County -> Gordon SD
czs[nrow(czs)+1,] = c('55901','22602') #Shawano + Menominee County -> Shawano WI
czs[nrow(czs)+1,] = c('02140','34105') #One Alaska area in 1980 that doesn't get caught. It's "Kobuk Census Area" which based on where Kobuk is seems to translate to Kotzebue

#Bring in AK crosswalk from BEA county groups to counties
akxw = read.csv('data/bea/st02_alaska_lookup.csv')
akxw$trufips = str_pad(akxw$Czfips, width = 5, pad = '0')
akxw$akcombine = str_pad(akxw$GeoFIPS, width = 5, pad = '0')
akxw = akxw %>% filter(!is.na(akxw$akcombine))

#Make updated list of AK commuting zones
akcz = merge(akxw, czs, by.x = 'trufips', by.y = 'fips')
stopifnot(!is.na(akcz$czone)) #All these mismatched places now have czs
akcz = select(akcz, akcombine, czone)
names(akcz) = c('fips','czone')
czs = rbind(czs, akcz)

#Bring in CZ names from Chetty et al 2014 online data tables
cznames = read.csv('data/reference/cznames_chetty2014.csv')
cznames$czone = str_pad(cznames$CZ,5,pad = '0')
cznames$czname = paste(cznames$CZ.Name,cznames$State,sep = ', ')
cznames = select(cznames, czone, czname)
czs = merge(czs, cznames, by = 'czone', all = T)
stopifnot(!is.na(czs$czname))


#Make PUMA-MSA and PUMA-CZ crosswalks

#First 2010
#Bring in crosswalk of PUMAs to counties for 2010
pumas = read.csv('data/mable/geocorr2010.csv',skip = 1)
pumas$fips = str_pad(pumas$county,width = 5,pad = '0')
pumas$st = str_pad(pumas$FIPS.state,width = 2,pad = '0')
pumas$puma1 = paste(str_pad(pumas$FIPS.state,width = 2,pad = '0'),str_pad(pumas$puma12,width = 5, pad ='0'),sep = '')

#Merge on MSAs to counties
pumas = merge(pumas, msas,by = 'fips',all.x = T)

#Merge on state
pumas = merge(pumas, states, by = 'st', all.x = T)

#Identify non-metro areas
pumas[is.na(pumas$msa),'msaname'] = paste(pumas[is.na(pumas$msa),'stname'],'non-Metro')
pumas[is.na(pumas$msa),'msa'] = pumas[is.na(pumas$msa),'st']

#Aggregate PUMAs to MSAs
puma1 = pumas %>% group_by(puma1,msa,msaname) %>% summarise(afactor = sum(puma12.to.county.alloc.factor),cfactor = sum(county.to.puma12.alloc.factor)) %>% data.frame()
stopifnot(sum(puma1$afactor) == sum(pumas$puma12.to.county.alloc.factor))

#save
write.csv(puma1, 'output/mable/puma1_to_msa.csv',row.names = F)

##Now do CZs
#Merge on CZs to counties
pumas = merge(pumas, czs, by = 'fips', all.x = T)
stopifnot(!is.na(pumas$czname))

#Aggregate PUMAs to CZs
puma1 = pumas %>% group_by(puma1,czone,czname) %>% summarise(afactor = sum(puma12.to.county.alloc.factor),cfactor = sum(county.to.puma12.alloc.factor)) %>% data.frame()
stopifnot(sum(puma1$afactor) == sum(pumas$puma12.to.county.alloc.factor))

#save
write.csv(puma1, 'output/mable/puma1_to_cz.csv',row.names = F)

## Also make full list of counties and MSAs in 2010 census. Note this doesn't have the manually added counties above
cntylist = pumas %>% group_by(fips,msa,msaname) %>% summarise(cntpm = length(msa), pop = sum(Total.Pop..2010.census)) %>% data.frame()
write.csv(cntylist, 'output/mable/county_msa_list.csv',row.names = F)

##Now do 2000
#Bring in crosswalk of PUMAs to counties 
pumas = read.csv('data/mable/geocorr2000.csv',skip = 1)
pumas$fips = str_pad(pumas$county,width = 5,pad = '0')
pumas$st = str_pad(pumas$FIPS.state,width = 2,pad = '0')
pumas$puma0 = paste(str_pad(pumas$FIPS.state,width = 2,pad = '0'),str_pad(pumas$puma2k,width = 4, pad ='0'),sep = '')

#Merge on MSAs to counties
pumas = merge(pumas, msas,by = 'fips',all.x = T)

#Merge on state
pumas = merge(pumas, states, by = 'st', all.x = T)

#Flag non-metro areas
pumas[is.na(pumas$msa),'msaname'] = paste(pumas[is.na(pumas$msa),'stname'],'non-Metro')
pumas[is.na(pumas$msa),'msa'] = pumas[is.na(pumas$msa),'st']

#Aggregate PUMAs to MSAs
puma0 = pumas %>% group_by(puma0,msa,msaname) %>% summarise(afactor = sum(puma2k.to.county.alloc.factor),cfactor = sum(county.to.puma2k.alloc.factor)) %>% data.frame()
stopifnot(sum(puma0$afactor) == sum(pumas$puma2k.to.county.alloc.factor))

#save
write.csv(puma0, 'output/mable/puma0_to_msa.csv',row.names = F)

##Now do CZs
#Merge on CZs to counties
pumas = merge(pumas, czs, by = 'fips', all.x = T)
stopifnot(!is.na(pumas$czname))

#Aggregate PUMAs to CZs
puma0 = pumas %>% group_by(puma0,czone,czname) %>% summarise(afactor = sum(puma2k.to.county.alloc.factor),cfactor = sum(county.to.puma2k.alloc.factor)) %>% data.frame()
stopifnot(sum(puma0$afactor) == sum(pumas$puma2k.to.county.alloc.factor))

#save
write.csv(puma0, 'output/mable/puma0_to_cz.csv',row.names = F)

##Now do 1990
#Bring in crosswalk of PUMAs to counties 
pumas = read.csv('data/mable/geocorr90.csv', colClasses = 'character')
pumas$fips = str_pad(pumas$county,width = 5,pad = '0')
pumas$st = str_pad(pumas$STATE.CODE..FIPS.,width = 2,pad = '0')
pumas$puma9 = paste(str_pad(pumas$STATE.CODE..FIPS.,width = 2,pad = '0'),str_pad(pumas$PUMA.code.from.1990.5...A..Sample,width = 5, pad ='0'),sep = '')

#Merge on MSAs to counties
pumas = merge(pumas, msas,by = 'fips',all.x = T)

#Merge on state
pumas = merge(pumas, states, by = 'st', all.x = T)

#Flag non-metro areas
pumas[is.na(pumas$msa),'msaname'] = paste(pumas[is.na(pumas$msa),'stname'],'non-Metro')
pumas[is.na(pumas$msa),'msa'] = pumas[is.na(pumas$msa),'st']

#Aggregate PUMAs to commuting zones
puma9 = pumas %>% group_by(puma9,msa,msaname) %>% summarise(afactor = sum(as.numeric(apuma.to.county.alloc.factor)),cfactor = sum(as.numeric(county.to.apuma.alloc.factor))) %>% data.frame()
stopifnot(sum(puma9$afactor) == sum(as.numeric(pumas$apuma.to.county.alloc.factor)))

#save
write.csv(puma9, 'output/mable/puma9_to_msa.csv',row.names = F)

##Now do CZs
#Merge on CZs to counties
pumas = merge(pumas, czs, by = 'fips', all.x = T)
stopifnot(!is.na(pumas$czname))

#Aggregate PUMAs to CZs
puma9 = pumas %>% group_by(puma9,czone,czname) %>% summarise(afactor = sum(as.numeric(apuma.to.county.alloc.factor)),cfactor = sum(as.numeric(county.to.apuma.alloc.factor))) %>% data.frame()
stopifnot(sum(puma9$afactor) == sum(as.numeric(pumas$apuma.to.county.alloc.factor)))

#save
write.csv(puma9, 'output/mable/puma9_to_cz.csv',row.names = F)



#Now make 1980 county group to county lookup
cg = read.csv('data/mable/cg98stat.csv')
cg$afactor =as.numeric(as.character(cg$County.Population)) / as.numeric(as.character(cg$County.Group.Population))
table(cg[is.na(cg$afactor),'State.FIPS.Code']) #Almost all the missings in Puerto Rico which doesn't have pop data
cg = filter(cg, !is.na(as.numeric(as.character(County.Population)))) #Drop missing
stopifnot(!is.na(cg$afactor))

cg$st = str_pad(cg$State.FIPS.Code, width = 2, pad = "0")
cg$cnty = str_pad(cg$County.FIPS.code, width = 3,pad = '0')
cg$fips = paste(cg$st, cg$cnty, sep = '')

cg$cntygrp = paste(cg$st, str_pad(cg$X1980.County.Group,width = 3, pad = '0'),sep = '')

#Merge on MSAs to counties
cg = merge(cg, msas,by = 'fips',all.x = T)

#Merge on state
cg = merge(cg, states, by = 'st', all.x = T)

#Flag non-metro areas
cg[is.na(cg$msa),'msaname'] = paste(cg[is.na(cg$msa),'stname'],'non-Metro')
cg[is.na(cg$msa),'msa'] = cg[is.na(cg$msa),'st']

#Aggregate cg to MSAs
cg8 = cg %>% group_by(cntygrp,msa,msaname) %>% summarise(afactor = sum(as.numeric(afactor))) %>% data.frame()
stopifnot(sum(cg8$afactor) == sum(cg$afactor))

#save
write.csv(cg8, 'output/mable/puma8_to_msa.csv',row.names = F)

#Add on CZs
cg = merge(cg, czs, by = 'fips', all.x = T)
stopifnot(!is.na(cg$czname))

#Aggregate CZs to MSAs and save
cg8 = cg %>% group_by(cntygrp,czone,czname) %>% summarise(afactor = sum(as.numeric(afactor))) %>% data.frame()
stopifnot(sum(cg8$afactor) == sum(cg$afactor))

#save
write.csv(cg8, 'output/mable/puma8_to_cz.csv',row.names = F)
