This is actually a small project from a while ago… The dataset can be found here; it includes blogs, news and Twitter posts as text files and is about 548MB.
Using RStudio, we'll download the dataset and generate two word clouds.
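For reference, these are the packages used throughout the post; install any that are missing before running the code.
# install.packages(c("stringi", "ggplot2", "tm", "RWeka", "wordcloud"))  # if not installed yet
library(stringi)    # word counting
library(ggplot2)    # histogram
library(tm)         # corpus cleaning and document-term matrices
library(RWeka)      # n-gram tokenizer
library(wordcloud)  # word clouds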
1 Download and extract data
setwd("~/wordcloud")
file.url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if(!file.exists("./Coursera-SwiftKey.zip")) {
  download.file(file.url, destfile="./Coursera-SwiftKey.zip", method="curl")
}
if(!file.exists("./final")) {
  unzip("./Coursera-SwiftKey.zip")
}
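A quick sanity check that the download really is about 548MB, as mentioned above:
file.size("./Coursera-SwiftKey.zip") / 1024^2   # size in MB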
2 Read the English blogs, news and Twitter files into R
if(!file.exists("./blogs.RData")) {
blogs.con <- file("./final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(blogs.con, encoding="UTF-8", skipNul = TRUE, warn = FALSE)
close(blogs.con)
save(blogs, file="blogs.RData")
}
if(!file.exists("./news.RData")) {
news.con <- file("./final/en_US/en_US.news.txt", "rb")
news <- readLines(news.con, encoding="UTF-8", skipNul = TRUE, warn = FALSE)
close(news.con)
save(news, file="news.RData")
}
if(!file.exists("./twitter.RData")) {
twitter.con <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(twitter.con, encoding="UTF-8", skipNul = TRUE, warn = FALSE)
close(twitter.con)
save(twitter, file="twitter.RData")
}
3 Look at how many lines each file has
load("blogs.RData")
load("news.RData")
load("twitter.RData")
# get total line numbers for each file
blogs.nrow <- length(blogs)
news.nrow <- length(news)
twitter.nrow <- length(twitter)
row.data <- c(blogs.nrow, news.nrow, twitter.nrow)
names(row.data) <- c("blogs", "news", "twitter")
row.data
##   blogs    news twitter
##  899288 1010242 2360148
barplot(row.data, ylab="Number of Lines", main="Bar Plot of Number of Lines")
4 For testing purposes, we'll just take a sample from each file.
# set random seeds for sampling - only sample 0.1% of the data for analysis
set.seed(1234)
blogs.random <- as.logical(rbinom(n=blogs.nrow, size=1, prob=0.001))
set.seed(2345)
news.random <- as.logical(rbinom(n=news.nrow, size=1, prob=0.001))
set.seed(3456)
twitter.random <- as.logical(rbinom(n=twitter.nrow, size=1, prob=0.001))
# do sampling and create subsets
blogs.sample <- blogs[blogs.random]
news.sample <- news[news.random]
twitter.sample <- twitter[twitter.random]
save(blogs.sample, file="blogs.sample.RData")
save(news.sample, file="news.sample.RData")
save(twitter.sample, file="twitter.sample.RData")
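A quick sanity check: each sample should hold roughly 0.1% of the line counts from step 3, i.e. about 899, 1010 and 2360 lines.
length(blogs.sample)
length(news.sample)
length(twitter.sample)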
5 Take a quick look at the distribution of the number of words per document in the news sample; it seems most of them are "short news"…
load("news.sample.RData")library(stringi)
news.sample.nword <- stri_count(news.sample,regex="\\S+")library(ggplot2)
ggplot(data.frame(x=news.sample.nword), aes(x)) + geom_histogram(bins=30) +
labs(title="Histogram of document word count from news sample data") +
labs(x="Number of words", y="Document Count")
6 Combine the blog, news and Twitter samples and build a unigram word cloud
full.sample <- c(blogs.sample, news.sample, twitter.sample)
library(tm)
full.sample.corpus <- VectorSource(full.sample)
full.sample.corpus <- VCorpus(full.sample.corpus)
full.sample.corpus <- tm_map(full.sample.corpus, content_transformer(tolower))
full.sample.corpus <- tm_map(full.sample.corpus, removeNumbers)
full.sample.corpus <- tm_map(full.sample.corpus, removePunctuation)
full.sample.corpus <- tm_map(full.sample.corpus, stripWhitespace)
full.sample.corpus <- tm_map(full.sample.corpus, removeWords, stopwords(kind="en"))
library(RWeka)
library(wordcloud)
full.sample.dtm1 <- DocumentTermMatrix(full.sample.corpus)
full.sample.dtm1.new <- removeSparseTerms(full.sample.dtm1,sparse = 0.99)
full.sample.dtm1.m <- as.matrix(full.sample.dtm1.new)
full.sample.frequency1 <- colSums(full.sample.dtm1.m)
wordcloud(names(full.sample.frequency1), full.sample.frequency1, min.freq = 15,
          scale = c(4,.2), colors = brewer.pal(6, 'Dark2'))
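For a quick peek at the actual counts behind the cloud:
head(sort(full.sample.frequency1, decreasing = TRUE), 10)   # ten most frequent unigrams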
So which words do we use the most: "one just said…will"?
7 How about bigrams?
ngram.tokenizer.2 <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
full.sample.dtm2 <- DocumentTermMatrix(full.sample.corpus, control=list(tokenize=ngram.tokenizer.2))
full.sample.dtm2.new <- removeSparseTerms(full.sample.dtm2,sparse = 0.999)
full.sample.dtm2.m <- as.matrix(full.sample.dtm2.new)
full.sample.frequency2 <- colSums(full.sample.dtm2.m)
wordcloud(names(full.sample.frequency2), full.sample.frequency2, min.freq = 5,
          scale = c(2,.2), colors = brewer.pal(6, 'Dark2'))
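And the same quick check for the bigram counts:
head(sort(full.sample.frequency2, decreasing = TRUE), 10)   # ten most frequent bigrams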
I've done trigrams and more, but I won't list them here… So what's the use of these n-grams? Can we predict the next word someone wants to type based on this limited dataset?
Sure we can; that deserves its own post another time, but the little sketch below shows the idea.
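The bigram frequencies above are already enough for a very rough next-word lookup. predict.next below is only a sketch: no smoothing, and with just 0.1% of the data sampled it will often find nothing at all.
# Rough helper: find the most frequent bigrams starting with `word`
# and return the words that follow it (sketch only, tiny sample, no smoothing)
predict.next <- function(word, freq = full.sample.frequency2, n = 3) {
  hits <- freq[startsWith(names(freq), paste0(word, " "))]
  if (length(hits) == 0) return(character(0))
  top <- names(sort(hits, decreasing = TRUE))[seq_len(min(n, length(hits)))]
  sub("^\\S+\\s+", "", top)   # keep only the second word of each bigram
}
predict.next("last")   # results depend entirely on what the sample happened to contain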
Happy Reading!