『データ可視化学入門』をRで書く 1.2 可視化の効果を考える
1.2 可視化の効果を考える
この記事はR言語 Advent Calendar 2023 シリーズ2の2日目の記事です。
qiita.com
1.2.1 文字情報と視覚情報による提示の違い
##### 1.2.1 ##### library(tidyverse) library(patchwork) # データフレームの作成 df <- data.frame( Country = c("日本", "ブラジル", "米国", "中国"), # 国のリスト Population = c(124620000, 215802222, 335540000, 1425849288) / 100000000 # 人口のリスト ) # 米国と日本のみを含む棒グラフの描画 p1 <- df |> dplyr::filter(Country %in% c("米国","日本")) |> arrange(desc(Population)) |> ggplot(aes(x = Population, y = reorder(Country, Population))) + geom_col() + labs(x = "人口[億人]", y = "国名") # 四か国を含む棒グラフの描画 p2 <- df |> arrange(desc(Population)) |> ggplot(aes(x = Population, y = reorder(Country, Population))) + geom_col() + labs(x = "人口[億人]", y = "国名") p1 / p2
1.2.2 2変数データにおけるパターンの発見
##### 1.2.2 ##### data <- data.frame( x = c(0.204, 1.07, -0.296, 0.57, 0.637, 0.82, 0.137, -0.046), y = c(0.07, 0.57, 0.936, 1.436, 0.32, 1.003, 1.186, 0.503) ) square_points <- data.frame( x = c(-0.296, 0.57, 1.07, 0.204, -0.296), y = c(0.936, 1.436,0.57,0.07, 0.936) ) ggplot(data, aes(x = x, y = y)) + geom_point() + geom_path(square_points, mapping = aes(x = x, y = y), color = "red", linetype = "dashed") + coord_fixed() + labs(title = "視覚情報による提示")
1.2.3 重要なつながりだけ抜き出す
これはうまくできていない。
都道府県コード順に円環状に並べる方法が分からない。
library(tidyverse) library(igraph) pref_ja <- c( "北海道", "青森", "岩手", "宮城", "秋田", "山形", "福島", "茨城", "栃木", "群馬", "埼玉", "千葉", "東京", "神奈川", "新潟", "富山", "石川", "福井", "山梨", "長野", "岐阜", "静岡", "愛知", "三重", "滋賀", "京都", "大阪", "兵庫", "奈良", "和歌山", "鳥取", "島根", "岡山", "広島", "山口", "徳島", "香川", "愛媛", "高知", "福岡", "佐賀", "長崎", "熊本", "大分", "宮崎", "鹿児島", "沖縄" ) # データをロード data_df <- read_csv( "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/1%E7%AB%A0/data/matrix.csv" ) # 前処理 data_matrix <- data_df[, -1] |> as.matrix(nrow = 47) diag(data_matrix) <- 0 data_matrix <- log1p(data_matrix) rownames(data_matrix) <- colnames(data_matrix) <- pref_ja # しきい値を計算 threshold <- quantile(data_matrix, .95) # しきい値をもとにグラフデータを作る gData <- data_matrix |> as.data.frame() |> rownames_to_column(var = "pref") |> pivot_longer(!pref) |> rename( from = pref, to = name, weight = value ) |> dplyr::filter(weight >= threshold) # グラフオブジェクトを生成 g <- graph_from_data_frame(gData, directed=TRUE) E(g)$width <- E(g)$weight/5 E(g)$color <- rainbow(100, alpha = 0.5)[round(E(g)$weight/max(E(g)$weight) * 100, 0)] V(g)$label.cex <- 0.5 V(g)$size <- 10 g |> plot( layout = layout_in_circle, edge.curved = 0.5, edge.arrow.size = 0.5 )
1.2.4 全体の関係性パターンを見つける
この図は書籍だと、ラベルが都道府県コード順になっているが、クラスタリングの結果なのでそんなはずはない。
誤植と思われる。
##### 1.2.4 ##### library(tidyverse) library(pheatmap) jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000")) pref_ja <- c( "北海道", "青森", "岩手", "宮城", "秋田", "山形", "福島", "茨城", "栃木", "群馬", "埼玉", "千葉", "東京", "神奈川", "新潟", "富山", "石川", "福井", "山梨", "長野", "岐阜", "静岡", "愛知", "三重", "滋賀", "京都", "大阪", "兵庫", "奈良", "和歌山", "鳥取", "島根", "岡山", "広島", "山口", "徳島", "香川", "愛媛", "高知", "福岡", "佐賀", "長崎", "熊本", "大分", "宮崎", "鹿児島", "沖縄" ) # データをロード data_df <- read_csv( "https://raw.githubusercontent.com/tkEzaki/data_visualization/main/1%E7%AB%A0/data/matrix.csv" ) # 前処理 data_matrix <- data_df[, -1] |> as.matrix(nrow = 47) data_matrix <- log1p(data_matrix) rownames(data_matrix) <- colnames(data_matrix) <- pref_ja # ヒートマップ pheatmap( data_matrix, clustering_distance_rows = "euclidean", clustering_method = "ward.D2", border_color = NA, color = jet.colors(50) )
1.2.5 様々なデータの並べ方
ここは1.2.5と1.2.6をまとめて。
棒グラフの色をグラデーションにしたのはこちらのアレンジ。
library(tidyverse) library(patchwork) # データ data_df <- data.frame( code = 1:47, prefectures = c( "北海道", "青森", "岩手", "宮城", "秋田", "山形", "福島", "茨城", "栃木", "群馬", "埼玉", "千葉", "東京", "神奈川", "新潟", "富山", "石川", "福井", "山梨", "長野", "岐阜", "静岡", "愛知", "三重", "滋賀", "京都", "大阪", "兵庫", "奈良", "和歌山", "鳥取", "島根", "岡山", "広島", "山口", "徳島", "香川", "愛媛", "高知", "福岡", "佐賀", "長崎", "熊本", "大分", "宮崎", "鹿児島", "沖縄" ), shipments = c( 42334, 31897, 56667, 177842, 31295, 39663, 110027, 307752, 173267, 186811, 579061, 299455, 580563, 368026, 83590, 79298, 42741, 53484, 99220, 136407, 190720, 240332, 568222, 159106, 136137, 110115, 432778, 291430, 136335, 35777, 37678, 22612, 108003, 176715, 65910, 29745, 48454, 49767, 20554, 181309, 95824, 34208, 66604, 41907, 37888, 44466, 5240 )/1000 # 桁が大きいので千t単位に調整 ) # 棒グラフ # 五十音順は割愛、出荷量順を追加 # 出荷量に応じてグラデーションを追加 p1 <- data_df |> ggplot(aes(x = reorder(prefectures, code), y = shipments, fill = shipments)) + geom_col() + labs(x = "", y = "出荷量[千t]", title = "都道府県コード順") + scale_fill_gradient(low = "blue", high = "red") + theme( axis.text.x = element_text(angle = 270, hjust = 1), aspect.ratio = 1/3, legend.position = "none" ) #縦横比 p2 <- data_df |> ggplot(aes(x = reorder(prefectures, desc(shipments)), y = shipments, fill = shipments)) + geom_col() + labs(x = "", y = "出荷量[千t]", title = "出荷量順") + scale_fill_gradient(low = "blue", high = "red") + theme( axis.text.x = element_text(angle = 270, hjust = 1), aspect.ratio = 1/3, legend.position = "none" ) p1 / p2
北海道をずらすのは手間なので割愛。
#地図にデータを可視化する library(NipponMap) jet.colors <- colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000")) data_df <- data_df |> mutate( col_num = round(log(shipments)/max(log(shipments))*100), col_name = jet.colors(100)[col_num] ) JapanPrefMap(col = data_df$col_name)
第1章2節は以上。