『データ可視化学入門』をRで書く 1.1 データを可視化するということ

『データ可視化学入門』をRで書く

この記事はR言語 Advent Calendar 2023 シリーズ2の1日目の記事です。
qiita.com

はじめに

2023年の12月に『指標・特徴量の設計から始めるデータ可視化学入門データを洞察につなげる技術』という本が出まして、これがなかなか良い本でした。
いわゆるグラフやチャートについての本ですが、よくある棒グラフや折れ線グラフだけでなく、箱髭図やネットワーク図までカバーされていて勉強になります。

指標・特徴量の設計から始めるデータ可視化学入門データを洞察につなげる技術

作者:江崎貴裕
ソシム

Amazon

グラフ描画のためのコードもgithubで公開されていてすぐに実践できます。

github.com

……しかし、このコードはPyhtonなのですよねぇ。
いやいや、可視化ならRでしょ！

というわけで。『データ可視化学入門』をRで書いてみます。

とりあえず第3章まで、節ごとに記事を分けて投稿します。

おことわり

解説は書きません。ぜひ書籍の方をお読みください。
Pythonの完コピではなく、Rらしい書き方を優先しています。
配色や目盛り線は再現は目指しません。
画像ファイルとして出力する部分は割愛しています。
書籍とgithubのコードで微妙に異なる箇所は極力書籍に合わせています。
各グラフのコードのみをコピペで動かせるようにいちいちlibraryを呼んでます。
ggplot2のみでいい箇所も面倒なのでtidyverseでまとめて呼びだしています。
まちがいやより良い書き方があったらご指摘ください。

1.1 データを可視化するということ

1.1.3 日本の総人口の推移

##### 1.1.3 #####
library(tidyverse)

data.frame(
  year = 1990:2022,
  population = c(
    1.23611, 1.24101, 1.24567, 1.24938, 1.25265, 1.2557, 1.25859, 1.26157,
    1.26472, 1.26667, 1.26926, 1.27316, 1.27486, 1.27694, 1.27787, 1.27768,
    1.27901, 1.28033, 1.28084, 1.28032, 1.28057, 1.27834, 1.27593, 1.27414,
    1.27237, 1.27095, 1.27042, 1.26919, 1.26749, 1.26555, 1.26146, 1.25502,
    1.24947
    )
  ) |>
  ggplot(aes(x = year, y = population)) +
  geom_line() +
  geom_point() +
  labs(
    x = "西暦",
    y = "日本の総人口 [億人]",
    title = "時間遷移を折れ線グラフで描画する"
    ) +
  theme(aspect.ratio = 3/5) #縦横比

1.1.4 散布図で見る総人口

##### 1.1.4 #####
library(tidyverse)

data.frame(
  year = 1990:2022,
  population = c(
    1.23611, 1.24101, 1.24567, 1.24938, 1.25265, 1.2557, 1.25859, 1.26157,
    1.26472, 1.26667, 1.26926, 1.27316, 1.27486, 1.27694, 1.27787, 1.27768,
    1.27901, 1.28033, 1.28084, 1.28032, 1.28057, 1.27834, 1.27593, 1.27414,
    1.27237, 1.27095, 1.27042, 1.26919, 1.26749, 1.26555, 1.26146, 1.25502,
    1.24947
    )
  ) |>
  mutate(next_year_population = lead(population)) |> # 次の年の人口列を追加
  drop_na() |> # 最後の行を削除（次の年の値がないため
  ggplot(aes(x = population, y = next_year_population)) +
  geom_point() +
  labs(
    x = "ある年の日本の総人口 [億人]",
    y = "次の年の日本の総人口 [億人]",
    title = "日本の総人口の前年比較"
    ) +
  coord_cartesian(xlim = c(1.23, 1.29), ylim = c(1.23, 1.29)) +
  theme(aspect.ratio = 1) #縦横比

1.1.5 折れ線グラフと散布図による可視化例その2

##### 1.1.5 #####
library(tidyverse)
library(patchwork)

# データの定義
data <- data.frame(
  t = 0:100,
  x_t = c(
    0.2, 0.64, 0.9216, 0.28901376, 0.821939226, 0.585420539, 0.970813326, 
    0.113339247, 0.401973849, 0.961563495, 0.14783656, 0.503923646, 0.99993842,
    0.000246305, 0.000984976, 0.003936025, 0.015682131, 0.061744808,
    0.231729548, 0.712123859, 0.820013873, 0.590364483, 0.967337041,
    0.126384362, 0.441645421, 0.986378972, 0.053741981, 0.203415122,
    0.648149641, 0.912206736, 0.320342428, 0.870892628, 0.449754634,
    0.989901613, 0.039985639, 0.153547151, 0.519881693, 0.998418873,
    0.006314507, 0.025098538, 0.097874404, 0.35318002, 0.913775574, 0.315159096,
    0.863335361, 0.471949661, 0.996852714, 0.012549522, 0.049568127,
    0.188444511, 0.611732709, 0.950063207, 0.189772438, 0.61503544, 0.94706739,
    0.200522995, 0.641254094, 0.920189124, 0.293764402, 0.829867512,
    0.564749697, 0.983229907, 0.065955428, 0.246421239, 0.742791249,
    0.764209638, 0.720773069, 0.805037009, 0.627809693, 0.934658729,
    0.244287156, 0.738443765, 0.772578283, 0.702804318, 0.835481634,
    0.549808293, 0.990076536, 0.039299956, 0.151021877, 0.512857079,
    0.999338782, 0.002643123, 0.010544547, 0.041733437, 0.15996703, 0.537510318,
    0.994371904, 0.022385682, 0.087538253, 0.319501229, 0.869680775, 0.4533445,
    0.991293057, 0.034524528, 0.13333034, 0.462213442, 0.994288704, 0.022714708,
    0.088795, 0.323641792, 0.87559113
    )
  ) |>
  mutate(lag_x_t = dplyr::lag(x_t))

p1 <- data |>
  ggplot(aes(x = t, y = x_t)) +
  geom_line() +
  geom_point() +
  labs(
    x = expression(italic(t)),
    y = expression(italic({x[t]})),
    title = "折れ線グラフによる可視化"
    ) +
  theme(aspect.ratio = 4/5) #縦横比

p2 <- data |>
  drop_na() |>
  ggplot(aes(x = lag_x_t, y = x_t)) +
  geom_point() +
  labs(
    x = expression(italic({x[t]})),
    y = expression(italic({x[t+1]})),
    title = "散布図による可視化"
    ) +
  theme(aspect.ratio = 4/5) #縦横比

# 描画レイアウトの設定と表示
p1 + p2

1.1.6 2個体間の類似度のスコア化の仮想的な例

##### 1.1.6 #####
library(tidyverse)
library(mvtnorm)

# 個体の位置データ
set.seed(0)
individual <- matrix(abs(rnorm(200)), ncol = 2)

# 個体1のデータ生成
individual1 <- individual[,1] / sum(individual[, 1]) 

# 個体2のデータ生成
individual2 <- individual[, 1] * 0.6 + individual[, 2] * 0.4
individual2 <- individual2 / sum(individual2)

df <- data.frame(
  個体X = individual1,
  個体Y = individual2
  )

##### 1.1.6.1 #####
df |>
  pivot_longer(everything()) |>
  mutate(X2 = rep(c(1:100), 2)) |>
  ggplot(aes(x = X2, y = value, fill = name)) +
  geom_col() +
  scale_fill_manual(values = c( "red","blue")) +
  labs(
    x = "滞在した場所ID",
    y = "滞在時間割合",
    title = "2個体の「行動」がどれだけ似ているか",
    fill = ""
    ) +
  theme(aspect.ratio = 1/2) #縦横比

##### 1.1.6.2 #####
df |>
  ggplot(aes(x = 個体X, y = 個体Y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "個体Xの行動データ", y = "個体Yの行動データ") +
  coord_fixed() # アスペクト比を等しく設定 

# ヒートマップを生成
# 2変量正規分布からデータ生成
x <- rmvnorm(
  n = 100,
  mean = c(0, 0),
  sigma = matrix(c(1, 0.8, 0.8, 1), nrow = 2)
  ) 

# 正規化関数
my_std <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}

# 正規化実行
x1 <- my_std(x[, 1])
x2 <- my_std(x[, 2])

# ヒートマップ用のデータ分割
heatmap1 <- matrix(x1, nrow = 10, ncol = 10)
heatmap2 <- matrix(x2, nrow = 10, ncol = 10)

##### 1.1.6.3 #####
# ヒートマップ描画
hm1 <- heatmap1 |>
  as.data.frame() |>
  pivot_longer(everything()) |>
  mutate(name2 = paste0("W", sort(rep(1:10, 10)))) |>
  ggplot(aes(x = name, y = name2, fill = value)) +
  geom_tile() +
  scale_fill_gradientn("value", colours = gray.colors(12, start = 0, end = 1, gamma = 2.2, alpha = 1)) +
  theme(
    aspect.ratio = 1,
    axis.text = element_blank(),
    axis.title = element_blank(), 
    axis.ticks = element_blank(),     
    legend.position = "none"
  ) +
  labs(title = "個体Xの見た目データ")

hm2 <- heatmap2 |>
  as.data.frame() |>
  pivot_longer(everything()) |>
  mutate(name2 = paste0("W", sort(rep(1:10, 10)))) |>
  ggplot(aes(x = name, y = name2, fill = value)) +
  geom_tile() +
  scale_fill_gradientn("value", colours = gray.colors(12, start = 0, end = 1, gamma = 2.2, alpha = 1)) +
  theme(
    aspect.ratio = 1,
    axis.text = element_blank(),
    axis.title = element_blank(), 
    axis.ticks = element_blank(),     
    legend.position = "none"
  ) +
  labs(title = "個体Yの見た目データ")

hm1 + hm2

##### 1.1.6.4 #####
data.frame(x1, x2) |>
  ggplot(aes(x = x1, y = x2)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(x = "個体Xの見た目データ", y = "個体Yの見た目データ") +
  coord_fixed() + # アスペクト比を等しく設定 
  scale_x_continuous(limits = c(0,1)) +
  scale_y_continuous(limits = c(0,1))

1.1.7 二つの類似度スコアの関係

##### 1.1.7 #####
library(tidyverse)
library(MASS)

set.seed(0)
n <- 190     # データ点の数
low <- -0.45 # データの範囲の下限
high <- 0.95 # データの範囲の上限
rho <- 0.7   # 相関係数
mean <- c(0, 0) # 平均（二変数）

Sigma <- matrix(c(1, rho, rho, 1), nrow=2) # 共分散行列

data <- mvrnorm(n = n-1, mu = mean, Sigma = Sigma) # 多変量正規分布からデータ生成

x <- data[,1]

y <- data[,2]

x <- low + (high - low) * (x - min(x)) / (max(x) - min(x)) # xデータの正規化

y <- low + (high - low) * (y - min(y)) / (max(y) - min(y)) # yデータの正規化

x <- c(x, 0.80) # 特定の点を追加

y <- c(y, 0.72) # 特定の点を追加

df <- data.frame(x = x, y = y)

df |>
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) + # 散布図の描画
  geom_smooth(method = "lm", se = FALSE) + # 回帰直線の描画
  geom_point(aes(x = 0.80, y = 0.72), colour = "red", size = 2) + # 特定の点を赤色で描画
  coord_fixed() + # アスペクト比を等しく設定
  annotate(
    "text",
    x=-0.25,
    y=0.75,
    size = 4,
    label=paste0("r = ", round(cor(df$x, df$y), 2))
    ) +
  labs(
    x = "個体間の見た目類似度スコア",
    y = "個体間の行動類似度スコア",
    title = "2つの類似度スコアの関係"
  )