『データ可視化学入門』をRで書く 3.3 2変数の関係をとらえる

3.3 2変数の関係をとらえる

この記事はR言語 Advent Calendar 2023 シリーズ2の11日目の記事です。
qiita.com

3.3.1 ペアプロットによる分布の可視化

##### 3.3.1 #####
library(tidyverse)
library(GGally)
library(palmerpenguins)

penguins |>
  ggpairs(
    columns = c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"),
    columnLabels = c("くちばしの長さ [mm]", "くちばしの厚さ [mm]", "ひれの長さ [mm]", "体重 [g]"),
    aes(color = species),
    diag = list(continuous = wrap("densityDiag", alpha = .5))
    )
3.3.1

3.3.2 相関を視るための散布図

##### 3.3.2 #####
library(tidyverse)
library(palmerpenguins)

penguins |>
  ggplot(aes(x = bill_length_mm, y = body_mass_g, color = species, fill = species)) +
  geom_point(alpha=0.3) +
  geom_smooth(method = "lm") +
  labs(x = "くちばしの長さ [mm]", y = "体重 [g]") +
  theme(aspect.ratio = 1)
3.3.2

3.3.3 y = x との比較

nestでもっと効率よく書けそうだが、よく分からなかったのでmapで。

##### 3.3.3 #####
library(tidyverse)
library(palmerpenguins)
library(rsample)
library(MLmetrics)

# データの準備
pen <- penguins |>
  select(body_mass_g, bill_length_mm, bill_depth_mm, flipper_length_mm, species) |>
  drop_na()

pen_list <- split(pen, pen$species)

# 分割
pen_split <- pen_list |>
  map(\(df) initial_split(df, prop = 0.8, strata = "body_mass_g"))
  
pen_train <-  pen_split |>
  map(\(df) training(df))

pen_test <-  pen_split |>
  map(\(df) testing(df))

# 学習
pen_lm <- pen_train |>
  map(\(df) lm(body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm, data = df))

# 検証
pen_pred <- pen_lm |>
  map2(pen_test, \(x, y) predict(x, newdata = y))

# 評価(RMSE)
pen_true <- pen_test |> map(pluck("body_mass_g"))
pen_pred |>
  map2_dbl(pen_true, \(y_pred, y_true) RMSE(y_pred, y_true))
# Adelie Chinstrap    Gentoo 
# 336.5512  305.2450  313.6054 

# 散布図の作成
data.frame(
  y_true = pen_true |> unlist(),
  y_pred = pen_pred |> unlist()
  ) |>
  rownames_to_column(var = "species") |>
  mutate(species = str_remove(species, "[0-9]+")) |>
  ggplot(aes(x = y_true, y = y_pred, color = species)) +
    geom_point(alpha = 0.6) +
    labs(x = "実際の体重 [g]", y = "モデルによる予測 [g]", title = "線形回帰による予測") +
    geom_abline(intercept = 0, slope = 1) +
  coord_cartesian(xlim = c(3000, 6000), ylim = c(3000, 6000)) +
  theme(aspect.ratio = 1)
3.3.3

3.3.4 y = x との比較:その他の例

疑似データ生成がややこしくて、間違ってたらゴメンナサイ

##### 3.3.4 #####
library(tidyverse)
library(patchwork)

set.seed(0)

num_users <- 100 # ユーザー数
replies_sent <- rep(0, num_users) # 送信したリプライ数
replies_received <- rep(0, num_users) # 受信したリプライ数

result <- map(
  1:num_users,
  ~{
    pattern <- sample(c('moderate', 'one_sided_few', 'one_sided_many'), 1) # 3つのパターンからランダムに選択
  if (pattern == 'moderate') {
    replies_sent <- round(runif(1, min = 10, max = 30)) # 10~30回の間でランダムに送信
    replies_received <- replies_sent + round(runif(1, min = -3, max = 3)) # 送信した回数±3回の間でランダムに受信
  } 
  else if (pattern == 'one_sided_few') {
    replies_sent <- round(runif(1, min = 0, max = 4)) # 0~3回の間でランダムに送信
    replies_received <- 0
  }
  else { # pattern == 'one_sided_many'
    replies_sent <- round(runif(1, min = 10, max = 30)) # 10~30回の間でランダムに送信
    replies_received <- 0
  }
  
  c(Sent = replies_sent, Received = replies_received)
}
)

df <- data.frame(do.call("rbind", result))

df2 <- df |>
  mutate(
    Sent = rev(Received),
    Received = rev(Sent)
    )

final_df <- bind_rows(df, df2)

# 体重データを生成
df_weights <- tibble(
  Weight1 = rnorm(50, mean = 60, sd = sqrt(2)), # 平均60、分散2の正規分布,
  Weight2 = Weight1 + rnorm(50, mean = -1, sd = sqrt(0.5)) # 平均-1、分散0.5の正規分布
)

# ggplotを用いたグラフ描画
p1 <- df_weights |>
  ggplot(aes(x = Weight1, y = Weight2)) + 
  geom_point(color = "orange") +
  geom_abline(intercept = 0, slope = 1, color = "black") +
  labs(
    title = "同じ量の2時点の比較",
    x = "実験開始時体重 [kg]",
    y = "一か月後体重 [kg]") +
  coord_cartesian(xlim = c(56, 64), ylim = c(56, 64)) +
  theme(aspect.ratio = 1)

p2 <- final_df |>
  ggplot(aes(x = Sent, y = Received)) + 
  geom_point(color = "lightblue") +
  geom_abline(intercept = 0, slope = 1, color = "black") +
  labs(
    title = "ペア間の双方向の量の比較",
    x = "送信したリプライ数",
    y = "受信したリプライ数"
    ) +
  coord_cartesian(xlim = c(0, 31), ylim = c(0, 31)) +
  theme(aspect.ratio = 1)

p1 + p2
3.3.4

3.3.5 サンプルサイズが大きいケース

##### 3.3.5 #####
library(tidyverse)
library(patchwork)

set.seed(0)

data <- bind_rows(
  tibble(# 相関の少ないデータの生成
    x = rnorm(100, 25, 10), # 平均25、分散10の正規分布
    y = rnorm(100, 75, 10)  # 平均75、分散10の正規分布
    ),
  tibble(# 相関の多いデータの生成
    x = rnorm(500, 50, 30), # 平均50、分散30の正規分布
    y = 0.5 * x + rnorm(500, 0, 10) # y = 0.5x + ノイズ
    ),
  tibble(
     x = rnorm(400, 25, 20), # 平均25、分散20の正規分布
     y = rnorm(400, 25, 30)  # 平均25、分散30の正規分布
     )
  )

# グラフ生成
p1 <- data |>
  ggplot(aes(x =x, y = y)) +
  geom_point() +
  coord_fixed() +
  labs(title = "不透明マーカー利用")

# 中央のサブプロットに統合データの半透明なマーカー散布図をプロット
p2 <- data |>
  ggplot(aes(x =x, y = y)) +
  geom_point(alpha = 0.1) +
  coord_fixed() +
  labs(title = "半透明マーカー利用")

# ヒートマップを生成
# みためがちょっと違うが2次元ヒストグラムなのでよし
p3 <- data |>
  ggplot(aes(x = x, y = y)) +
  stat_density2d(aes(fill = after_stat(density)), geom="tile", contour=FALSE, n = 20) +
  scale_fill_gradientn(colours = rev(rainbow(10)[1:8])) +
  coord_fixed() +
  labs(title = "ヒートマップ")

p1 + p2 +p3
3.3.5

3.3.6 バブルチャートによる情報提示

##### 3.3.6 #####
library(tidyverse)
library(ggrepel)
library(janitor)

# データの読み込み
df <- read_csv(
  "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/3%E7%AB%A0/data/bubble_chart_data.csv"
  ) |>
  clean_names() |>
  filter(x2021_yr2021 != "..") |>
  mutate(x2021_yr2021 = as.numeric(x2021_yr2021))

region_df <- read_csv(
  "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/3%E7%AB%A0/data/bubble_chart_region_data.csv"
  ) |>
  clean_names() 

# データの前処理
df_split <- df |>
  split(df$series_name)

names(df_split) <- c("gdp", "life_exp", "population")

# GDP, Life expectancy, Populationのデータをマージ
data <- df_split$gdp |>
  select(country_name, country_code, x2021_yr2021) |>
  left_join(
    df_split$life_exp |>
      select(country_name, x2021_yr2021),
    by = join_by(country_name),
    suffix = c("_GDP", "_Life_Expectancy")
    ) |>
  left_join(
    df_split$population |>
      select(country_name, x2021_yr2021),
    by = join_by(country_name)
    ) |>
  rename(
    x2021_yr2021_Population = x2021_yr2021
  ) |>
  left_join( #regionをCountry Codeでjoin
    region_df |>
      select(alpha_3, region),
    by = join_by(country_code == alpha_3)
    ) |>
  left_join( #regionをCountry_Nameでjoin
    region_df |>
      select(name, region),
    by = join_by(country_name == name)
  ) |>
  mutate( #region確定
    region = if_else(is.na(region.x), region.y, region.x),
    .keep = "unused"
    ) |>
  drop_na() |> #region無い行を削除
  mutate(# GDP per capita(千ドル)を計算
    GDP_per_capita = x2021_yr2021_GDP / x2021_yr2021_Population / 1000,
    x2021_yr2021_Population = x2021_yr2021_Population / 100000
  )

# 描画
data |>
  ggplot(aes(
    x = GDP_per_capita,
    y = x2021_yr2021_Life_Expectancy,
    color = region,
    size = x2021_yr2021_Population,
    label = country_name
    )) +
  geom_point(alpha = 0.6) +
  scale_size_area(max_size = 11) + #値と面積が比例するように
  geom_text_repel(size = 3) +
  scale_x_log10() +
  labs(
    title = "色や大きさで情報を付加する",
    x = "一人あたりGDP [1,000$](対数軸)",
    y = "平均寿命 [年]", 
    size = "人口(10万人)", 
    color = "地域"
    ) +
  theme(aspect.ratio = 1)
3.3.6

第3章3節は以上。