『データ可視化学入門』をRで書く 3.3 2変数の関係をとらえる
3.3 2変数の関係をとらえる
この記事はR言語 Advent Calendar 2023 シリーズ2の11日目の記事です。
qiita.com
3.3.1 ペアプロットによる分布の可視化
##### 3.3.1 ##### library(tidyverse) library(GGally) library(palmerpenguins) penguins |> ggpairs( columns = c("bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"), columnLabels = c("くちばしの長さ [mm]", "くちばしの厚さ [mm]", "ひれの長さ [mm]", "体重 [g]"), aes(color = species), diag = list(continuous = wrap("densityDiag", alpha = .5)) )
3.3.2 相関を視るための散布図
##### 3.3.2 ##### library(tidyverse) library(palmerpenguins) penguins |> ggplot(aes(x = bill_length_mm, y = body_mass_g, color = species, fill = species)) + geom_point(alpha=0.3) + geom_smooth(method = "lm") + labs(x = "くちばしの長さ [mm]", y = "体重 [g]") + theme(aspect.ratio = 1)
3.3.3 y = x との比較
nestでもっと効率よく書けそうだが、よく分からなかったのでmapで。
##### 3.3.3 ##### library(tidyverse) library(palmerpenguins) library(rsample) library(MLmetrics) # データの準備 pen <- penguins |> select(body_mass_g, bill_length_mm, bill_depth_mm, flipper_length_mm, species) |> drop_na() pen_list <- split(pen, pen$species) # 分割 pen_split <- pen_list |> map(\(df) initial_split(df, prop = 0.8, strata = "body_mass_g")) pen_train <- pen_split |> map(\(df) training(df)) pen_test <- pen_split |> map(\(df) testing(df)) # 学習 pen_lm <- pen_train |> map(\(df) lm(body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm, data = df)) # 検証 pen_pred <- pen_lm |> map2(pen_test, \(x, y) predict(x, newdata = y)) # 評価(RMSE) pen_true <- pen_test |> map(pluck("body_mass_g")) pen_pred |> map2_dbl(pen_true, \(y_pred, y_true) RMSE(y_pred, y_true)) # Adelie Chinstrap Gentoo # 336.5512 305.2450 313.6054 # 散布図の作成 data.frame( y_true = pen_true |> unlist(), y_pred = pen_pred |> unlist() ) |> rownames_to_column(var = "species") |> mutate(species = str_remove(species, "[0-9]+")) |> ggplot(aes(x = y_true, y = y_pred, color = species)) + geom_point(alpha = 0.6) + labs(x = "実際の体重 [g]", y = "モデルによる予測 [g]", title = "線形回帰による予測") + geom_abline(intercept = 0, slope = 1) + coord_cartesian(xlim = c(3000, 6000), ylim = c(3000, 6000)) + theme(aspect.ratio = 1)
3.3.4 y = x との比較:その他の例
疑似データ生成がややこしくて、間違ってたらゴメンナサイ
##### 3.3.4 ##### library(tidyverse) library(patchwork) set.seed(0) num_users <- 100 # ユーザー数 replies_sent <- rep(0, num_users) # 送信したリプライ数 replies_received <- rep(0, num_users) # 受信したリプライ数 result <- map( 1:num_users, ~{ pattern <- sample(c('moderate', 'one_sided_few', 'one_sided_many'), 1) # 3つのパターンからランダムに選択 if (pattern == 'moderate') { replies_sent <- round(runif(1, min = 10, max = 30)) # 10~30回の間でランダムに送信 replies_received <- replies_sent + round(runif(1, min = -3, max = 3)) # 送信した回数±3回の間でランダムに受信 } else if (pattern == 'one_sided_few') { replies_sent <- round(runif(1, min = 0, max = 4)) # 0~3回の間でランダムに送信 replies_received <- 0 } else { # pattern == 'one_sided_many' replies_sent <- round(runif(1, min = 10, max = 30)) # 10~30回の間でランダムに送信 replies_received <- 0 } c(Sent = replies_sent, Received = replies_received) } ) df <- data.frame(do.call("rbind", result)) df2 <- df |> mutate( Sent = rev(Received), Received = rev(Sent) ) final_df <- bind_rows(df, df2) # 体重データを生成 df_weights <- tibble( Weight1 = rnorm(50, mean = 60, sd = sqrt(2)), # 平均60、分散2の正規分布, Weight2 = Weight1 + rnorm(50, mean = -1, sd = sqrt(0.5)) # 平均-1、分散0.5の正規分布 ) # ggplotを用いたグラフ描画 p1 <- df_weights |> ggplot(aes(x = Weight1, y = Weight2)) + geom_point(color = "orange") + geom_abline(intercept = 0, slope = 1, color = "black") + labs( title = "同じ量の2時点の比較", x = "実験開始時体重 [kg]", y = "一か月後体重 [kg]") + coord_cartesian(xlim = c(56, 64), ylim = c(56, 64)) + theme(aspect.ratio = 1) p2 <- final_df |> ggplot(aes(x = Sent, y = Received)) + geom_point(color = "lightblue") + geom_abline(intercept = 0, slope = 1, color = "black") + labs( title = "ペア間の双方向の量の比較", x = "送信したリプライ数", y = "受信したリプライ数" ) + coord_cartesian(xlim = c(0, 31), ylim = c(0, 31)) + theme(aspect.ratio = 1) p1 + p2
3.3.5 サンプルサイズが大きいケース
##### 3.3.5 ##### library(tidyverse) library(patchwork) set.seed(0) data <- bind_rows( tibble(# 相関の少ないデータの生成 x = rnorm(100, 25, 10), # 平均25、分散10の正規分布 y = rnorm(100, 75, 10) # 平均75、分散10の正規分布 ), tibble(# 相関の多いデータの生成 x = rnorm(500, 50, 30), # 平均50、分散30の正規分布 y = 0.5 * x + rnorm(500, 0, 10) # y = 0.5x + ノイズ ), tibble( x = rnorm(400, 25, 20), # 平均25、分散20の正規分布 y = rnorm(400, 25, 30) # 平均25、分散30の正規分布 ) ) # グラフ生成 p1 <- data |> ggplot(aes(x =x, y = y)) + geom_point() + coord_fixed() + labs(title = "不透明マーカー利用") # 中央のサブプロットに統合データの半透明なマーカー散布図をプロット p2 <- data |> ggplot(aes(x =x, y = y)) + geom_point(alpha = 0.1) + coord_fixed() + labs(title = "半透明マーカー利用") # ヒートマップを生成 # みためがちょっと違うが2次元ヒストグラムなのでよし p3 <- data |> ggplot(aes(x = x, y = y)) + stat_density2d(aes(fill = after_stat(density)), geom="tile", contour=FALSE, n = 20) + scale_fill_gradientn(colours = rev(rainbow(10)[1:8])) + coord_fixed() + labs(title = "ヒートマップ") p1 + p2 +p3
3.3.6 バブルチャートによる情報提示
##### 3.3.6 ##### library(tidyverse) library(ggrepel) library(janitor) # データの読み込み df <- read_csv( "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/3%E7%AB%A0/data/bubble_chart_data.csv" ) |> clean_names() |> filter(x2021_yr2021 != "..") |> mutate(x2021_yr2021 = as.numeric(x2021_yr2021)) region_df <- read_csv( "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/3%E7%AB%A0/data/bubble_chart_region_data.csv" ) |> clean_names() # データの前処理 df_split <- df |> split(df$series_name) names(df_split) <- c("gdp", "life_exp", "population") # GDP, Life expectancy, Populationのデータをマージ data <- df_split$gdp |> select(country_name, country_code, x2021_yr2021) |> left_join( df_split$life_exp |> select(country_name, x2021_yr2021), by = join_by(country_name), suffix = c("_GDP", "_Life_Expectancy") ) |> left_join( df_split$population |> select(country_name, x2021_yr2021), by = join_by(country_name) ) |> rename( x2021_yr2021_Population = x2021_yr2021 ) |> left_join( #regionをCountry Codeでjoin region_df |> select(alpha_3, region), by = join_by(country_code == alpha_3) ) |> left_join( #regionをCountry_Nameでjoin region_df |> select(name, region), by = join_by(country_name == name) ) |> mutate( #region確定 region = if_else(is.na(region.x), region.y, region.x), .keep = "unused" ) |> drop_na() |> #region無い行を削除 mutate(# GDP per capita(千ドル)を計算 GDP_per_capita = x2021_yr2021_GDP / x2021_yr2021_Population / 1000, x2021_yr2021_Population = x2021_yr2021_Population / 100000 ) # 描画 data |> ggplot(aes( x = GDP_per_capita, y = x2021_yr2021_Life_Expectancy, color = region, size = x2021_yr2021_Population, label = country_name )) + geom_point(alpha = 0.6) + scale_size_area(max_size = 11) + #値と面積が比例するように geom_text_repel(size = 3) + scale_x_log10() + labs( title = "色や大きさで情報を付加する", x = "一人あたりGDP [1,000$](対数軸)", y = "平均寿命 [年]", size = "人口(10万人)", color = "地域" ) + theme(aspect.ratio = 1)
第3章3節は以上。