UP | HOME

A Manhattan plot of the words of English

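A Manhattan plot is the graphical display of genome-wide association study (GWAS) results across the chromosomes. Here is a Manhattan-style plot of the first two letters of the words of English: each point is a two-letter prefix, its height is the number of dictionary words that begin with it, and the prefixes are grouped and colored by their first letter.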

## Print a dictionary as a Manhattan plot. Steve Bagley, Sept 2015.

library(data.table)
library(RColorBrewer)

## read the word list: one word per line, no header row
words <- fread("/usr/share/dict/web2", header = FALSE)
setnames(words, "V1", "word")
## drop hyphens so that a bigram is never built from one
words[, word := gsub("-", "", word, fixed = TRUE)]
## store bigram only using lowercase
words[, bigram := tolower(substr(word, 1, 2))]
## count bigram occurrences. keyby (rather than by) sorts the result by
## bigram, so all bigrams sharing an initial letter sit in consecutive rows
## no matter how the word list itself happens to be ordered
tab <- words[, .N, keyby = bigram]
## build percentiles: fraction of bigrams whose count is <= this one
dist_fn <- tab[, ecdf(N)]
tab[, percentile := dist_fn(N)]
tab[, initial := substr(bigram, 1, 1)]
## get colors. palette only has 8 colors, so wrap around to cover alphabet
plot_colors <- rep(brewer.pal(8, "Dark2"), length.out = 26)
## color is set by initial letter of bigram. match() against letters gives
## a = 1 ... z = 26 and NA for anything outside a-z, instead of an invalid
## zero or negative subscript
tab[, color := plot_colors[match(initial, letters)]]
## x position of each bigram is its row number in the sorted table
tab[, index := as.numeric(.I)]
## build plot, don't label x-axis yet. the top of the y-axis is derived from
## the data, with 10% headroom to make room for label of highest point
y_top <- tab[, 1.1 * max(N)]
tab[, plot(index, N, col = color, pch = 19, cex = 0.5, xaxt = "n",
           ylim = c(0, y_top))]
## x-axis extent of each initial letter; its label goes at the mean location.
## the letter's own color is carried along so that label and points agree
## even when some letter is absent or the rows are not in a-z order
alphabet <- tab[, .(mean_loc = mean(index),
                    min_loc = min(index),
                    max_loc = max(index),
                    color = color[1L]),
                by = initial]
## some very light gray lines between the letters
alphabet[, abline(v = c(min_loc - 0.5, max(max_loc) + 0.5), col = "grey95")]
## draw them on x-axis, in color
alphabet[, mtext(initial, side = 1, at = mean_loc, col = color)]
## put the points back on top of background
tab[, points(index, N, col = color, pch = 19, cex = 0.5)]
## mark outliers (bigram counts above the 95th percentile) with bigram label
tab[percentile > 0.95, text(index, N, labels = bigram, pos = 3, col = "grey45")]

Author: Steven Bagley

Date: 2015-09-19 Sat