library(tidyverse)
library(haven)
library(survey)
library(srvyr)
library(kamaken)
<- partial(knitr::kable, digits = 3) kable
前準備
パッケージ読み込み
データ読み込み
- PIAAC 1st cycleの日本データ(SPSS形式)
- 変数の名前を適宜変更しておく
<- read_sav('data/prgjpnp1.sav')
data # 直接読み込むこともできるが多少時間がかかる
# data <- read_sav('https://webfs.oecd.org/piaac/puf-data/SPSS/prgjpnp1.sav')
<-
df |>
data # 変数を絞る
select(
country = CNTRYID,
id = SEQID,
age = AGE_R,
gender = GENDER_R,
region = REG_TL2,
edu = B_Q01a,
medu = J_Q06b,
fedu = J_Q07b,
numbooks = J_Q08,
sampling_weight = SPFWT0,
# 読解力、数的思考力、ITスキルのスコア
matches('^PV'),
# Replicate weights
matches(str_c('SPFWT', 1:80)),
VEMETHOD|>
) # 変数名を小文字に変換
rename_with(str_to_lower) |>
as_factor() |>
mutate(
age = as.character(age) |> parse_double(),
# 年齢をカテゴリ化
agegroup = cut(
age,breaks = c(15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65),
labels = c('16-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60', '61-65')
),# 大学卒業ダミー
univ = case_match(
edu,'No formal qualification or below ISCED 1' ~ 0,
'ISCED 2' ~ 0,
'ISCED 3C shorter than 2 years' ~ 0,
'ISCED 3C 2 years or more' ~ 0,
'ISCED 3A-B' ~ 0,
'ISCED 3 (without distinction A-B-C, 2y+)' ~ 0,
'ISCED 4 (without distinction A-B-C)' ~ 0,
'ISCED 5B' ~ 0,
'ISCED 5A, bachelor degree' ~ 1,
'ISCED 5A, master degree' ~ 1,
'ISCED 6' ~ 1,
'Foreign qualification' ~ NA,
NA ~ NA
) )
標準誤差の推定式
- jackknife法による標準誤差
\[ \mathrm{SE}_\theta = \sqrt{h\sum_{r=1}^{R} (\hat{\theta}_{(r)} - \hat{\theta})^2} \]
- \(R\):反復回数、replicate weightの数(PIAACは80)
- \(\hat{\theta}_{(r)}\):\(r\)番目のreplicate weightを用いた推定値
- \(\hat{\theta}\):全体の推定値(サンプリングウェイトを用いた推定値)
- \(h\):乗数、 Jackknife法のバリエーションによって異なる
- JK1の場合は\(h = \frac{R-1}{R}\)
- JK2の場合は\(h = 1\)
分析
- 点推定値
- jackknife法による標準誤差
の二つを推定する
packageによる推定
survey
パッケージ(のラッパーのsrvyr
パッケージ)を用いるtype
: 標準誤差の推定方法- PIAACでは
JK1
の国とJK2
の国が混在(変数VEMETHOD
にどちらを使用すれば良いかが書かれている) - 日本は
JK2
で推定する
- PIAACでは
<-
df_design |>
df # 調査デザインの設定
as_survey_rep(
weights = sampling_weight,
repweights = matches('spfwt'),
type = 'JK2',
mse = TRUE
)
<-
result_literacy |>
df_design # 読解力の各PVごとに平均値と標準誤差を計算
summarise(
across(matches('pvlit'), \(x) survey_mean(x, na.rm = TRUE))
)
# 結果の整理
<-
package |>
result_literacy rename_with(\(x) str_replace(x, '(\\d)$', '\\1_estimate')) |>
# 縦持ちに変換
pivot_longer(
cols = matches('pvlit'),
names_to = c('literacy', '.value'),
names_pattern = '(pvlit\\d{1,2})_(.+)',
)
kable(package)
literacy | estimate | se |
---|---|---|
pvlit1 | 296.468 | 0.549 |
pvlit2 | 296.143 | 0.529 |
pvlit3 | 296.184 | 0.543 |
pvlit4 | 296.111 | 0.541 |
pvlit5 | 296.172 | 0.522 |
pvlit6 | 296.936 | 0.543 |
pvlit7 | 295.561 | 0.481 |
pvlit8 | 296.827 | 0.518 |
pvlit9 | 296.188 | 0.521 |
pvlit10 | 295.832 | 0.542 |
手作業による推定
<-
point_estimate |>
df select(sampling_weight, matches('(pvlit|spfwt)')) |>
summarise(
across(
matches('pvlit'),
weighted.mean(x, w = sampling_weight, na.rm = TRUE),
\(x) .names = '{.col}_estimate'
)|>
) pivot_longer(
cols = matches('pvlit'),
names_to = c('literacy', '.value'),
names_pattern = '(pvlit\\d{1,2})_(.+)',
)
# jackknife法に用いるウェイト(80個)
<- select(df, matches('spfwt'))
jackweight
<-
jack_estimate # 各ウェイトを用いて、それぞれのPVの平均値を計算
map(
jackweight, \(weight) |>
df summarise(
across(
matches('pvlit'),
weighted.mean(x, w = weight, na.rm = TRUE),
\(x) .names = '{.col}_jack'
)
)|>
) bind_rows(.id = 'replicate') |>
pivot_longer(
cols = matches('pvlit'),
names_to = c('literacy', '.value'),
names_pattern = '(pvlit\\d{1,2})_(.+)',
)
<-
handmade # 点推定値にジャックナイフウェイトを用いた推定値を結合
left_join(
point_estimate,
jack_estimate, by = join_by(literacy)
|>
) # 標準誤差の計算
summarise(
estimate = mean(estimate),
se = sqrt(sum((estimate - jack)^2)),
.by = literacy
)
kable(handmade)
literacy | estimate | se |
---|---|---|
pvlit1 | 296.468 | 0.549 |
pvlit2 | 296.143 | 0.529 |
pvlit3 | 296.184 | 0.543 |
pvlit4 | 296.111 | 0.541 |
pvlit5 | 296.172 | 0.522 |
pvlit6 | 296.936 | 0.543 |
pvlit7 | 295.561 | 0.481 |
pvlit8 | 296.827 | 0.518 |
pvlit9 | 296.188 | 0.521 |
pvlit10 | 295.832 | 0.542 |
結果の比較
- 結果は一致する
left_join(
package,
handmade, by = join_by(literacy),
suffix = c('_package', '_handmade')
|>
) select(literacy, estimate_package, estimate_handmade, se_package, se_handmade) |>
mutate(
diff_estimate = estimate_package - estimate_handmade,
diff_se = se_package - se_handmade
|>
) kable()
literacy | estimate_package | estimate_handmade | se_package | se_handmade | diff_estimate | diff_se |
---|---|---|---|---|---|---|
pvlit1 | 296.468 | 296.468 | 0.549 | 0.549 | 0 | 0 |
pvlit2 | 296.143 | 296.143 | 0.529 | 0.529 | 0 | 0 |
pvlit3 | 296.184 | 296.184 | 0.543 | 0.543 | 0 | 0 |
pvlit4 | 296.111 | 296.111 | 0.541 | 0.541 | 0 | 0 |
pvlit5 | 296.172 | 296.172 | 0.522 | 0.522 | 0 | 0 |
pvlit6 | 296.936 | 296.936 | 0.543 | 0.543 | 0 | 0 |
pvlit7 | 295.561 | 295.561 | 0.481 | 0.481 | 0 | 0 |
pvlit8 | 296.827 | 296.827 | 0.518 | 0.518 | 0 | 0 |
pvlit9 | 296.188 | 296.188 | 0.521 | 0.521 | 0 | 0 |
pvlit10 | 295.832 | 295.832 | 0.542 | 0.542 | 0 | 0 |
PVの統合
読解力、数的思考力などは複数のPV(Plausible Values)という形で測定
PVを用いた分析の結果は、多重代入法の要領で統合が可能
詳しくは van Buuren (2018) などを参照
平均値
\[ \hat{\theta} = \frac{1}{M} \sum_{m=1}^{M} \hat{\theta}_{(m)} \]
- 各PVにおいて生じる分散の平均(分散の平均値)
\[ \bar{U} = \frac{1}{M} \sum_{m=1}^{M} \hat{U}_{(m)} \]
- PV間で生じる分散(平均値の分散)
\[ B = \frac{1}{M-1} \sum_{m=1}^{M} (\hat{\theta}_{(m)} - \hat{\theta})^2 \]
- 統合された推定値の標準誤差
\[ \hat{SE} = \sqrt{\bar{U} + \left(1 + \frac{1}{M}\right)B} \]
<- function(.tibble, term = NULL) {
pool_rubin |>
.tibble mutate(variance = std.error^2) |>
summarise(
M = n(),
estimate.combined = mean(estimate),
Vw = sum(variance) / M,
Vb = sum((estimate - mean(estimate)) ^ 2) / (M - 1),
Vt = Vw + (1 + 1 / M) * Vb,
SE.combined = sqrt(Vt),
.by = {{term}}
|>
) # 信頼区間計算
mutate(
conf.low = estimate.combined - qnorm(1 - .025)*SE.combined,
conf.high = estimate.combined + qnorm(1 - .025)*SE.combined
|>
) select({{term}}, M, estimate = estimate.combined, std.error = SE.combined, conf.low, conf.high)
}
- 結果の統合
|>
handmade rename(std.error = se) |>
pool_rubin() |>
kable()
M | estimate | std.error | conf.low | conf.high |
---|---|---|---|---|
10 | 296.242 | 0.685 | 294.9 | 297.585 |
より複雑なケース
因果効果の推定
- 高等教育進学が読解力に与える因果効果の推定
- G-computation(Hernán and Robins 2020; Robins 1986)によるATEの推定
- 関数定義
.outcome
:アウトカム変数.treatment
:0-1の二値変数.formula_rhs
:回帰式の右辺を指定~ treatment + covariate1 + covariate2 + ...
のような形- 固定効果も指定可能:
~ 1 | treatment + fixed_effect1 + fixed_effect2 + ...
fixest
パッケージでのformula
に準拠
.estimand
:推定対象cfmean
:\(\mathrm{E}[Y^1]\)と\(\mathrm{E}[Y^0]\)ATE
:平均処置効果
.weights
:サンプリングウェイト.repweights
:ジャックナイフ法に用いるreplicate weights.type
:ジャックナイフ法のタイプ.by
:効果の異質性を見たい変数を指定(NULL
なら集団全体)
# ATEをjackknife推定する関数
<-
piaac_ATE_jack function(.data, .outcome, .treatment, .formula_rhs, .estimand = c('cfmean', 'ATE'), .weights, .repweights, .type = c('JK1', 'JK2'), .by = NULL) {
<- enquo(.treatment)
.treatment <- enquo(.weights)
.weights <- select(.data, {{.repweights}}) |> names()
.repweights <- enquo(.by)
.by # scale parameterの設定
<- case_when(
scale == 'JK1' ~ (length(.repweights) - 1) / length(.repweights),
.type == 'JK2' ~ 1
.type
)# formulaの左辺に.outcomeを追加
<- rlang::`f_lhs<-`(.formula_rhs, ensym(.outcome))
.formula
# OLS推定
<-
fit ::feols(
fixest
.formula,data = .data,
combine.quick = FALSE,
weights = str_c('~', rlang::as_name(.weights)) |> as.formula()
)# 点推定値
<-
res_point_estimate bind_rows(
::augment(fit, newdata = .data |> mutate(!!.treatment := 0)),
broom::augment(fit, newdata = .data |> mutate(!!.treatment := 1)),
broom|>
) group_by(pick(!!.treatment, !!.by)) |>
summarise(
estimate = weighted.mean(.fitted, !!.weights),
.groups = 'drop'
|>
) mutate(estimand = 'cfmean')
if (.estimand == 'ATE') {
<-
res_point_estimate |>
res_point_estimate mutate(estimand = 'ATE') |>
pivot_wider(names_from = !!.treatment, values_from = estimate) |>
mutate(estimate = `1` - `0`) |>
select(!c(`0`, `1`))
}
# 各replicate weightを使ってOLS推定し、jackknifeサンプルにおける点推定値を計算する
<-
fit_jack ::future_map(
furrr.progress = TRUE, .options = furrr::furrr_options(seed = TRUE), \(w) {
.repweights, gc()
# weightが0のデータを削除
<- .data |> filter(!!ensym(w) != 0)
df_jack # OLSで推定
<-
fit ::feols(
fixest
.formula,data = df_jack,
weights = str_c('~', w) |> as.formula(),
combine.quick = FALSE
)# Y^1とY^0を計算
<-
res_point_estimate_jack bind_rows(
::augment(fit, newdata = df_jack |> mutate(!!.treatment := 0)),
broom::augment(fit, newdata = df_jack |> mutate(!!.treatment := 1)),
broom|>
) group_by(pick(!!.treatment, !!.by)) |>
summarise(
estimate = weighted.mean(.fitted, !!ensym(w)),
.groups = 'drop'
|>
) mutate(estimand = 'cfmean')
if (.estimand == 'ATE') {
<-
res_point_estimate_jack |>
res_point_estimate_jack mutate(estimand = 'ATE') |>
pivot_wider(names_from = !!.treatment, values_from = estimate) |>
mutate(estimate = `1` - `0`) |>
select(!c(`0`, `1`))
}
res_point_estimate_jack
}
)# 結果の統合
<-
results |>
fit_jack bind_rows(.id = 'no') |>
rename(estimate_jack = estimate) |>
# 点推定値をくっつける
left_join(res_point_estimate |> rename(estimate_all = estimate))
# jackknife標準誤差の計算
if (.estimand == 'cfmean') {
<-
results |>
results group_by(pick(estimand, !!.treatment, !!.by)) |>
summarise(
estimate = mean(estimate_all),
std.error = sqrt(scale * sum((estimate_all - estimate_jack)^2, na.rm = TRUE)),
.groups = 'drop'
)
}else if (.estimand == 'ATE') {
<-
results |>
results group_by(pick(estimand, !!.by)) |>
summarise(
estimate = mean(estimate_all),
std.error = sqrt(scale * sum((estimate_all - estimate_jack)^2, na.rm = TRUE)),
.groups = 'drop'
)
}
results }
推定結果
# 並列化
::plan('multisession')
future
|>
df filter(age >= 26) |>
piaac_ATE_jack(
.outcome = pvlit1,
.treatment = univ,
.formula_rhs = ~ 1 | univ^gender^agegroup,
.estimand = 'ATE',
.weights = sampling_weight,
.repweights = matches('spfwt'),
.type = 'JK2',
.by = c(gender, agegroup)
|>
) kable()
estimand | gender | agegroup | estimate | std.error |
---|---|---|---|---|
ATE | Male | 26-30 | 26.590 | 4.692 |
ATE | Male | 31-35 | 24.986 | 4.766 |
ATE | Male | 36-40 | 28.109 | 3.835 |
ATE | Male | 41-45 | 28.916 | 3.917 |
ATE | Male | 46-50 | 30.594 | 4.789 |
ATE | Male | 51-55 | 35.483 | 5.616 |
ATE | Male | 56-60 | 38.366 | 5.703 |
ATE | Male | 61-65 | 24.382 | 4.711 |
ATE | Female | 26-30 | 22.546 | 3.546 |
ATE | Female | 31-35 | 24.534 | 3.581 |
ATE | Female | 36-40 | 27.753 | 4.466 |
ATE | Female | 41-45 | 34.266 | 4.167 |
ATE | Female | 46-50 | 23.310 | 4.708 |
ATE | Female | 51-55 | 32.056 | 5.081 |
ATE | Female | 56-60 | 26.026 | 4.619 |
ATE | Female | 61-65 | 40.169 | 10.041 |
10個のPVに対して一括で推定
# 読解力PVの変数名を取得
<- df |> select(matches('pvlit')) |> names()
pvs
<-
result map(
pvs,piaac_ATE_jack(
\(x) .data = df |> filter(age >= 26),
# mapで回すときは`!!`を使う
.outcome = !!x,
.treatment = univ,
.formula_rhs = ~ 1 | univ^gender^agegroup,
.estimand = 'ATE',
.weights = sampling_weight,
.repweights = matches('spfwt'),
.type = 'JK2',
.by = gender
|>
)) bind_rows(.id = 'pv')
result
# A tibble: 20 × 5
pv estimand gender estimate std.error
<chr> <chr> <fct> <dbl> <dbl>
1 1 ATE Male 29.4 1.61
2 1 ATE Female 29.2 2.01
3 2 ATE Male 29.8 1.78
4 2 ATE Female 29.7 1.98
5 3 ATE Male 29.4 1.76
6 3 ATE Female 29.6 2.13
7 4 ATE Male 32.2 1.73
8 4 ATE Female 32.3 2.16
9 5 ATE Male 30.6 1.71
10 5 ATE Female 29.8 1.82
11 6 ATE Male 30.5 1.67
12 6 ATE Female 29.7 2.06
13 7 ATE Male 30.4 1.78
14 7 ATE Female 31.0 1.80
15 8 ATE Male 30.0 1.84
16 8 ATE Female 31.2 1.91
17 9 ATE Male 32.1 1.66
18 9 ATE Female 31.3 2.17
19 10 ATE Male 30.8 1.60
20 10 ATE Female 32.7 1.81
結果の統合
|>
result pool_rubin(term = c(estimand, gender)) |>
kable()
estimand | gender | M | estimate | std.error | conf.low | conf.high |
---|---|---|---|---|---|---|
ATE | Male | 10 | 30.526 | 1.996 | 26.613 | 34.439 |
ATE | Female | 10 | 30.643 | 2.364 | 26.011 | 35.276 |
参考文献
Jakubowski, Maciej & Artur Pokropek, 2019, “piaactools: A program for data analysis with PIAAC data,” The Stata Journal, 19(1): 112-128. https://doi.org/10.1177/1536867X19830909
References
Buuren, Stef van. 2018. Flexible Imputation of Missing Data. 2nd ed. New York: Chapman & Hall/CRC. https://stefvanbuuren.name/fimd/.
Hernán, Miguel A., and James M. Robins. 2020. Causal Inference: What If. Boca Raton: Chapman & Hall/CRC. https://miguelhernan.org/whatifbook.
Robins, James. 1986. “A New Approach to Causal Inference in Mortality Studies with a Sustained Exposure Period—Application to Control of the Healthy Worker Survivor Effect.” Mathematical Modelling 7 (9): 1393–1512. https://doi.org/10.1016/0270-0255(86)90088-6.