Rで連続変数のカテゴリ化

R
Author

Kentaro Kamada

Published

November 25, 2024

データ

  • いま1から100までの連続変数があるとする
library(tidyverse)

# Example data: one integer column x holding the values 1 through 100.
data <- tibble(x = 1:100)

# Print all 100 rows.
print(data, n = 100)
# A tibble: 100 × 1
        x
    <int>
  1     1
  2     2
  3     3
  4     4
  5     5
  6     6
  7     7
  8     8
  9     9
 10    10
 11    11
 12    12
 13    13
 14    14
 15    15
 16    16
 17    17
 18    18
 19    19
 20    20
 21    21
 22    22
 23    23
 24    24
 25    25
 26    26
 27    27
 28    28
 29    29
 30    30
 31    31
 32    32
 33    33
 34    34
 35    35
 36    36
 37    37
 38    38
 39    39
 40    40
 41    41
 42    42
 43    43
 44    44
 45    45
 46    46
 47    47
 48    48
 49    49
 50    50
 51    51
 52    52
 53    53
 54    54
 55    55
 56    56
 57    57
 58    58
 59    59
 60    60
 61    61
 62    62
 63    63
 64    64
 65    65
 66    66
 67    67
 68    68
 69    69
 70    70
 71    71
 72    72
 73    73
 74    74
 75    75
 76    76
 77    77
 78    78
 79    79
 80    80
 81    81
 82    82
 83    83
 84    84
 85    85
 86    86
 87    87
 88    88
 89    89
 90    90
 91    91
 92    92
 93    93
 94    94
 95    95
 96    96
 97    97
 98    98
 99    99
100   100

カテゴリ化

  • 連続変数を0%-25%, 25%-50%, 50%-75%, 75%-100%の4つのカテゴリに分ける
  • cutとquantileを組み合わせるとうまくいく
  • 1から25がQ1, 26から50がQ2, 51から75がQ3, 76から100がQ4になる
data |>
  mutate(
    # Quartile bins with the default right-closed intervals; include.lowest
    # keeps the minimum (x = 1) inside the first bin.
    x_cut = cut(x, quantile(x, probs = (0:4) / 4), include.lowest = TRUE),
    # Same bins, with explicit labels in place of the interval notation.
    x_cut2 = cut(
      x,
      quantile(x, probs = (0:4) / 4),
      labels = c("Q1", "Q2", "Q3", "Q4"),
      include.lowest = TRUE
    ),
    # right = FALSE closes each interval on its left edge instead.
    x_cut3 = cut(
      x,
      quantile(x, probs = (0:4) / 4),
      right = FALSE,
      include.lowest = TRUE
    )
  ) |>
  print(n = 100)
# A tibble: 100 × 4
        x x_cut       x_cut2 x_cut3     
    <int> <fct>       <fct>  <fct>      
  1     1 [1,25.8]    Q1     [1,25.8)   
  2     2 [1,25.8]    Q1     [1,25.8)   
  3     3 [1,25.8]    Q1     [1,25.8)   
  4     4 [1,25.8]    Q1     [1,25.8)   
  5     5 [1,25.8]    Q1     [1,25.8)   
  6     6 [1,25.8]    Q1     [1,25.8)   
  7     7 [1,25.8]    Q1     [1,25.8)   
  8     8 [1,25.8]    Q1     [1,25.8)   
  9     9 [1,25.8]    Q1     [1,25.8)   
 10    10 [1,25.8]    Q1     [1,25.8)   
 11    11 [1,25.8]    Q1     [1,25.8)   
 12    12 [1,25.8]    Q1     [1,25.8)   
 13    13 [1,25.8]    Q1     [1,25.8)   
 14    14 [1,25.8]    Q1     [1,25.8)   
 15    15 [1,25.8]    Q1     [1,25.8)   
 16    16 [1,25.8]    Q1     [1,25.8)   
 17    17 [1,25.8]    Q1     [1,25.8)   
 18    18 [1,25.8]    Q1     [1,25.8)   
 19    19 [1,25.8]    Q1     [1,25.8)   
 20    20 [1,25.8]    Q1     [1,25.8)   
 21    21 [1,25.8]    Q1     [1,25.8)   
 22    22 [1,25.8]    Q1     [1,25.8)   
 23    23 [1,25.8]    Q1     [1,25.8)   
 24    24 [1,25.8]    Q1     [1,25.8)   
 25    25 [1,25.8]    Q1     [1,25.8)   
 26    26 (25.8,50.5] Q2     [25.8,50.5)
 27    27 (25.8,50.5] Q2     [25.8,50.5)
 28    28 (25.8,50.5] Q2     [25.8,50.5)
 29    29 (25.8,50.5] Q2     [25.8,50.5)
 30    30 (25.8,50.5] Q2     [25.8,50.5)
 31    31 (25.8,50.5] Q2     [25.8,50.5)
 32    32 (25.8,50.5] Q2     [25.8,50.5)
 33    33 (25.8,50.5] Q2     [25.8,50.5)
 34    34 (25.8,50.5] Q2     [25.8,50.5)
 35    35 (25.8,50.5] Q2     [25.8,50.5)
 36    36 (25.8,50.5] Q2     [25.8,50.5)
 37    37 (25.8,50.5] Q2     [25.8,50.5)
 38    38 (25.8,50.5] Q2     [25.8,50.5)
 39    39 (25.8,50.5] Q2     [25.8,50.5)
 40    40 (25.8,50.5] Q2     [25.8,50.5)
 41    41 (25.8,50.5] Q2     [25.8,50.5)
 42    42 (25.8,50.5] Q2     [25.8,50.5)
 43    43 (25.8,50.5] Q2     [25.8,50.5)
 44    44 (25.8,50.5] Q2     [25.8,50.5)
 45    45 (25.8,50.5] Q2     [25.8,50.5)
 46    46 (25.8,50.5] Q2     [25.8,50.5)
 47    47 (25.8,50.5] Q2     [25.8,50.5)
 48    48 (25.8,50.5] Q2     [25.8,50.5)
 49    49 (25.8,50.5] Q2     [25.8,50.5)
 50    50 (25.8,50.5] Q2     [25.8,50.5)
 51    51 (50.5,75.2] Q3     [50.5,75.2)
 52    52 (50.5,75.2] Q3     [50.5,75.2)
 53    53 (50.5,75.2] Q3     [50.5,75.2)
 54    54 (50.5,75.2] Q3     [50.5,75.2)
 55    55 (50.5,75.2] Q3     [50.5,75.2)
 56    56 (50.5,75.2] Q3     [50.5,75.2)
 57    57 (50.5,75.2] Q3     [50.5,75.2)
 58    58 (50.5,75.2] Q3     [50.5,75.2)
 59    59 (50.5,75.2] Q3     [50.5,75.2)
 60    60 (50.5,75.2] Q3     [50.5,75.2)
 61    61 (50.5,75.2] Q3     [50.5,75.2)
 62    62 (50.5,75.2] Q3     [50.5,75.2)
 63    63 (50.5,75.2] Q3     [50.5,75.2)
 64    64 (50.5,75.2] Q3     [50.5,75.2)
 65    65 (50.5,75.2] Q3     [50.5,75.2)
 66    66 (50.5,75.2] Q3     [50.5,75.2)
 67    67 (50.5,75.2] Q3     [50.5,75.2)
 68    68 (50.5,75.2] Q3     [50.5,75.2)
 69    69 (50.5,75.2] Q3     [50.5,75.2)
 70    70 (50.5,75.2] Q3     [50.5,75.2)
 71    71 (50.5,75.2] Q3     [50.5,75.2)
 72    72 (50.5,75.2] Q3     [50.5,75.2)
 73    73 (50.5,75.2] Q3     [50.5,75.2)
 74    74 (50.5,75.2] Q3     [50.5,75.2)
 75    75 (50.5,75.2] Q3     [50.5,75.2)
 76    76 (75.2,100]  Q4     [75.2,100] 
 77    77 (75.2,100]  Q4     [75.2,100] 
 78    78 (75.2,100]  Q4     [75.2,100] 
 79    79 (75.2,100]  Q4     [75.2,100] 
 80    80 (75.2,100]  Q4     [75.2,100] 
 81    81 (75.2,100]  Q4     [75.2,100] 
 82    82 (75.2,100]  Q4     [75.2,100] 
 83    83 (75.2,100]  Q4     [75.2,100] 
 84    84 (75.2,100]  Q4     [75.2,100] 
 85    85 (75.2,100]  Q4     [75.2,100] 
 86    86 (75.2,100]  Q4     [75.2,100] 
 87    87 (75.2,100]  Q4     [75.2,100] 
 88    88 (75.2,100]  Q4     [75.2,100] 
 89    89 (75.2,100]  Q4     [75.2,100] 
 90    90 (75.2,100]  Q4     [75.2,100] 
 91    91 (75.2,100]  Q4     [75.2,100] 
 92    92 (75.2,100]  Q4     [75.2,100] 
 93    93 (75.2,100]  Q4     [75.2,100] 
 94    94 (75.2,100]  Q4     [75.2,100] 
 95    95 (75.2,100]  Q4     [75.2,100] 
 96    96 (75.2,100]  Q4     [75.2,100] 
 97    97 (75.2,100]  Q4     [75.2,100] 
 98    98 (75.2,100]  Q4     [75.2,100] 
 99    99 (75.2,100]  Q4     [75.2,100] 
100   100 (75.2,100]  Q4     [75.2,100] 
  • 3分割だとこんな感じ
  • 100人を3分割すると1人あまるので、34人目までが最初の区間となっている
  • right = FALSEにすると、33人目までが最初の区間となり、最後の区間が34人になる
data |>
  mutate(
    # Tercile bins, right-closed: the first bin [1,34] holds 34 values.
    x_cut = cut(x, quantile(x, probs = (0:3) / 3), include.lowest = TRUE),
    # Left-closed variant: the first bin [1,34) holds 33 values and the
    # last bin [67,100] holds 34.
    x_cut2 = cut(
      x,
      quantile(x, probs = (0:3) / 3),
      right = FALSE,
      include.lowest = TRUE
    )
  ) |>
  print(n = Inf)
# A tibble: 100 × 3
        x x_cut    x_cut2  
    <int> <fct>    <fct>   
  1     1 [1,34]   [1,34)  
  2     2 [1,34]   [1,34)  
  3     3 [1,34]   [1,34)  
  4     4 [1,34]   [1,34)  
  5     5 [1,34]   [1,34)  
  6     6 [1,34]   [1,34)  
  7     7 [1,34]   [1,34)  
  8     8 [1,34]   [1,34)  
  9     9 [1,34]   [1,34)  
 10    10 [1,34]   [1,34)  
 11    11 [1,34]   [1,34)  
 12    12 [1,34]   [1,34)  
 13    13 [1,34]   [1,34)  
 14    14 [1,34]   [1,34)  
 15    15 [1,34]   [1,34)  
 16    16 [1,34]   [1,34)  
 17    17 [1,34]   [1,34)  
 18    18 [1,34]   [1,34)  
 19    19 [1,34]   [1,34)  
 20    20 [1,34]   [1,34)  
 21    21 [1,34]   [1,34)  
 22    22 [1,34]   [1,34)  
 23    23 [1,34]   [1,34)  
 24    24 [1,34]   [1,34)  
 25    25 [1,34]   [1,34)  
 26    26 [1,34]   [1,34)  
 27    27 [1,34]   [1,34)  
 28    28 [1,34]   [1,34)  
 29    29 [1,34]   [1,34)  
 30    30 [1,34]   [1,34)  
 31    31 [1,34]   [1,34)  
 32    32 [1,34]   [1,34)  
 33    33 [1,34]   [1,34)  
 34    34 [1,34]   [34,67) 
 35    35 (34,67]  [34,67) 
 36    36 (34,67]  [34,67) 
 37    37 (34,67]  [34,67) 
 38    38 (34,67]  [34,67) 
 39    39 (34,67]  [34,67) 
 40    40 (34,67]  [34,67) 
 41    41 (34,67]  [34,67) 
 42    42 (34,67]  [34,67) 
 43    43 (34,67]  [34,67) 
 44    44 (34,67]  [34,67) 
 45    45 (34,67]  [34,67) 
 46    46 (34,67]  [34,67) 
 47    47 (34,67]  [34,67) 
 48    48 (34,67]  [34,67) 
 49    49 (34,67]  [34,67) 
 50    50 (34,67]  [34,67) 
 51    51 (34,67]  [34,67) 
 52    52 (34,67]  [34,67) 
 53    53 (34,67]  [34,67) 
 54    54 (34,67]  [34,67) 
 55    55 (34,67]  [34,67) 
 56    56 (34,67]  [34,67) 
 57    57 (34,67]  [34,67) 
 58    58 (34,67]  [34,67) 
 59    59 (34,67]  [34,67) 
 60    60 (34,67]  [34,67) 
 61    61 (34,67]  [34,67) 
 62    62 (34,67]  [34,67) 
 63    63 (34,67]  [34,67) 
 64    64 (34,67]  [34,67) 
 65    65 (34,67]  [34,67) 
 66    66 (34,67]  [34,67) 
 67    67 (34,67]  [67,100]
 68    68 (67,100] [67,100]
 69    69 (67,100] [67,100]
 70    70 (67,100] [67,100]
 71    71 (67,100] [67,100]
 72    72 (67,100] [67,100]
 73    73 (67,100] [67,100]
 74    74 (67,100] [67,100]
 75    75 (67,100] [67,100]
 76    76 (67,100] [67,100]
 77    77 (67,100] [67,100]
 78    78 (67,100] [67,100]
 79    79 (67,100] [67,100]
 80    80 (67,100] [67,100]
 81    81 (67,100] [67,100]
 82    82 (67,100] [67,100]
 83    83 (67,100] [67,100]
 84    84 (67,100] [67,100]
 85    85 (67,100] [67,100]
 86    86 (67,100] [67,100]
 87    87 (67,100] [67,100]
 88    88 (67,100] [67,100]
 89    89 (67,100] [67,100]
 90    90 (67,100] [67,100]
 91    91 (67,100] [67,100]
 92    92 (67,100] [67,100]
 93    93 (67,100] [67,100]
 94    94 (67,100] [67,100]
 95    95 (67,100] [67,100]
 96    96 (67,100] [67,100]
 97    97 (67,100] [67,100]
 98    98 (67,100] [67,100]
 99    99 (67,100] [67,100]
100   100 (67,100] [67,100]

quantileのアルゴリズム

  • 1から100を4分割すると、分位点は直感的に25,50,75になると思われるが、上記の結果ではそうなっていない
  • Rのquantile関数は9つのアルゴリズムがあり、アルゴリズムごとに結果が異なりうる
    • デフォルトはtype = 7
  • 詳細はstats::quantileのヘルプを参照
# Default algorithm: the quartiles of 1:100 come out as 25.75, 50.5, 75.25.
quantile(1:100, probs = (0:4) / 4)
    0%    25%    50%    75%   100% 
  1.00  25.75  50.50  75.25 100.00 
# type = 7 spelled out explicitly; the result matches the default call.
quantile(1:100, probs = (0:4) / 4, type = 7)
    0%    25%    50%    75%   100% 
  1.00  25.75  50.50  75.25 100.00 
# type = 1 gives the whole-number cut points 25, 50, 75 for this input.
quantile(1:100, probs = (0:4) / 4, type = 1)
  0%  25%  50%  75% 100% 
   1   25   50   75  100 
# type = 2 gives 25.5, 50.5, 75.5 for this input.
quantile(1:100, probs = (0:4) / 4, type = 2)
   0%   25%   50%   75%  100% 
  1.0  25.5  50.5  75.5 100.0 
# type = 3 also gives the whole-number cut points 25, 50, 75 here.
quantile(1:100, probs = (0:4) / 4, type = 3)
  0%  25%  50%  75% 100% 
   1   25   50   75  100 
# type = 4 gives 25, 50, 75 for this input as well.
quantile(1:100, probs = (0:4) / 4, type = 4)
  0%  25%  50%  75% 100% 
   1   25   50   75  100 
# type = 5 gives 25.5, 50.5, 75.5 for this input.
quantile(1:100, probs = (0:4) / 4, type = 5)
   0%   25%   50%   75%  100% 
  1.0  25.5  50.5  75.5 100.0 
# type = 6 gives 25.25, 50.5, 75.75 for this input.
quantile(1:100, probs = (0:4) / 4, type = 6)
    0%    25%    50%    75%   100% 
  1.00  25.25  50.50  75.75 100.00 
# type = 8 gives roughly 25.41667, 50.5, 75.58333 for this input.
quantile(1:100, probs = (0:4) / 4, type = 8)
       0%       25%       50%       75%      100% 
  1.00000  25.41667  50.50000  75.58333 100.00000 
# type = 9 gives 25.4375, 50.5, 75.5625 for this input.
quantile(1:100, probs = (0:4) / 4, type = 9)
      0%      25%      50%      75%     100% 
  1.0000  25.4375  50.5000  75.5625 100.0000