# Getting Started with Text Mining in R

## Prerequisites

In this talk, I am using R version 3.1.3 (2015-03-09), "Smooth Sidewalk," for my examples, though any recent version of R should be fine. We'll be using revision 1345 of the tm package.

You'll need to load the following R packages to carry out the examples below.

```r
library(cluster)   # clustering algorithms
library(ggdendro)  # dendrograms with ggplot2
library(ggplot2)   # plotting
library(reshape2)  # reshaping matrices for plotting
library(tm)        # text mining framework
```
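
If any of these packages are missing, you can install them from CRAN first. A quick convenience sketch; the vector of names simply mirrors the library() calls above:

```r
# Install any of the required packages that are not already present.
needed  <- c("cluster", "ggdendro", "ggplot2", "reshape2", "tm")
missing <- needed[!needed %in% rownames(installed.packages())]
if (length(missing) > 0) install.packages(missing)

# Confirm which version of R you are running.
R.version.string
```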

## Creating a Corpus

Ted Underwood collected this data from Project Gutenberg. See his excellent introduction to text mining, "Where to start with text mining," as well as his updated post, "Seven ways humanists are using computers to understand text."

You may download the data for the corpus we'll be building here. You should unzip the files to a folder titled 19cTexts on your Desktop.
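
If you prefer to fetch and unpack the archive from within R itself, something along these lines should work. The URL below is a stand-in, not the real download address, and the sketch assumes the zip file does not already contain a 19cTexts folder:

```r
# Hypothetical sketch: replace the URL with the actual link to the zipped corpus.
url <- "http://example.com/19cTexts.zip"                    # placeholder address
download.file(url, destfile = "~/Desktop/19cTexts.zip")
unzip("~/Desktop/19cTexts.zip", exdir = "~/Desktop/19cTexts")
```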

```r
# Point R at the data folder and build a corpus from every file
# in the 1800-09 subdirectory.
setwd("/Users/Clifford/Desktop/19cTexts")
corpus <- Corpus(DirSource("1800-09"))
corpus

# Set aside a working copy that drops the first document,
# which is a tab-delimited metadata table rather than a text.
clean <- corpus[-1]

# Examine the content and metadata of a single document.
corpus[[2]]$content
(corpus[[2]]$content)[5:15]
corpus[[2]]$meta
meta(corpus[[2]])
DublinCore(corpus[[2]])

# Assign a Dublin Core title from the corresponding field of the
# metadata table, then write the corpus to disk.
DublinCore(corpus[[2]], tag = "title") <- strsplit(corpus[[1]]$content, '\t')[[2]][14]
writeCorpus(corpus, path = "./corpus", filenames = DublinCore(corpus)$identifier)
```

## Cleaning a Corpus

```r
# List the transformations that tm provides out of the box.
getTransformations()
[1] "removeNumbers"     "removePunctuation" "removeWords"       "stemDocument"      "stripWhitespace"

# Normalize the text: collapse whitespace, lowercase everything,
# and strip numbers and punctuation.
clean <- tm_map(clean, stripWhitespace)
clean <- tm_map(clean, content_transformer(tolower))
clean <- tm_map(clean, removeNumbers)
clean <- tm_map(clean, removePunctuation)

# Join multiword names into a single token so they are counted together.
clean <- tm_map(clean, content_transformer(gsub), pattern = 'new york', replacement = 'newyork')

# Drop common English stopwords, then stem what remains.
clean <- tm_map(clean, removeWords, stopwords("english"))
stem  <- tm_map(clean, stemDocument, language = "english")

# Write the cleaned, stemmed corpus back out to disk.
writeCorpus(stem, path = "./corpus", filenames = DublinCore(corpus)$identifier)
```
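
To check what the transformations actually did, it helps to compare the same passage before and after cleaning. Because clean was built as corpus[-1], document 1 of clean corresponds to document 2 of corpus; the line range here is arbitrary:

```r
# Compare the raw, cleaned, and stemmed versions of the same passage.
# (clean[[1]] and stem[[1]] correspond to corpus[[2]], since the
# metadata file was dropped; lines 5:7 are an arbitrary sample.)
(corpus[[2]]$content)[5:7]   # original text
(clean[[1]]$content)[5:7]    # lowercased, no numbers/punctuation/stopwords
(stem[[1]]$content)[5:7]     # stemmed as well
```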

## Exploring Term Document Matrices

```r
# Build term-document and document-term matrices from the cleaned corpus.
tdm <- TermDocumentMatrix(clean)
dtm <- DocumentTermMatrix(clean)

# Restrict the matrix to a dictionary of economic terms and inspect it.
inspect(DocumentTermMatrix(clean,
  list(dictionary = c("economy", "money", "finance", "debt",
                      "income", "expenditures", "bonds", "stocks"))))

# Find terms occurring at least 10,000 times.
findFreqTerms(tdm, 10000, Inf)

# Find terms correlated with "money" at 0.85 or higher.
findAssocs(tdm, "money", .85)
$money
    subsidy    merchant         pay      paying        sums       emden     packers        paid 
       0.93        0.91        0.91        0.91        0.91        0.90        0.90        0.90 
    trading     payment silkweavers   southwark      buying     coinage      coined     foreign 
       0.90        0.89        0.89        0.89        0.88        0.88        0.88        0.88 
 foreigners      london   merchants    purchase        sell     brokers    licences       ports 
       0.88        0.88        0.88        0.88        0.88        0.87        0.87        0.87 
 privileges  stipulated      truely     wherein     charter  commercial     enacted     england 
       0.87        0.87        0.87        0.87        0.86        0.86        0.86        0.86 
  exempting   exporting       faked       penny      prices    subjects    commerce       duely 
       0.86        0.86        0.86        0.86        0.86        0.86        0.85        0.85 
fishmongers     granted       later       naval   newcastle  poflefllon     selling thenceforth 
       0.85        0.85        0.85        0.85        0.85        0.85        0.85        0.85 
```
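
It can also be useful to rank terms by overall frequency. A minimal sketch, summing each term's counts across all documents (note that as.matrix() densifies tm's sparse matrix, which is fine for a corpus of this size):

```r
# Total count of each term across all documents, most frequent first.
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
head(freq, 20)   # the twenty most frequent terms
```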

## Visualizing a Corpus

### Making a Heatmap

This example is adapted slightly from "Text Mining the Complete Works of William Shakespeare" by Andrew Collier.

```r
# Keep only the terms that appear in nearly every document, convert
# the term-document matrix to an ordinary matrix, and drop the first column.
common <- removeSparseTerms(tdm, 0.005)
common <- as.matrix(common)
common <- common[, -1]

# Melt the matrix into long format for ggplot2.
dense <- melt(common, value.name = "count")

# Draw a tile for each term/document pair, shaded by log frequency.
g <- ggplot(dense, aes(x = Docs, y = Terms, fill = log(count)))
g <- g + geom_tile(colour = "grey")
g <- g + scale_fill_gradient(high = "#114357", low = "#F29492")
g <- g + theme(axis.text.x = element_text(angle = 90, hjust = 1))
g <- g + ggtitle("Heatmap of Term Frequency")
g
```

*(Figure: heatmap of term frequency by document.)*

### Clustering Similar Words

This visualization is adapted slightly from "Basic Text Mining in R."

```r
# Keep only the most frequent terms, then compute pairwise distances
# between terms (terms become rows after transposing).
sparse <- removeSparseTerms(dtm, 0.01)
d <- dist(t(sparse), method = "euclidean")

# Cluster the terms hierarchically and extract the dendrogram data.
fit <- hclust(d = d, method = "ward.D2")
ddata <- dendro_data(fit)

# Draw the dendrogram with ggplot2 and ggdendro.
g <- ggplot(segment(ddata))
g <- g + geom_segment(aes(x = x, y = y, xend = xend, yend = yend))
g <- g + geom_text(data = ddata$labels, aes(x = x, y = y, label = label))
g <- g + coord_flip()
g <- g + theme_dendro()
g <- g + ggtitle("Clusters of Most Frequent Terms")
g
```

*(Figure: dendrogram of clusters of the most frequent terms.)*
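
Beyond eyeballing the dendrogram, you can cut the tree into a fixed number of groups with cutree() to get explicit cluster assignments. A sketch; k = 5 is an arbitrary choice for illustration:

```r
# Cut the hierarchical clustering into k groups.
groups <- cutree(fit, k = 5)          # k = 5 chosen arbitrarily
table(groups)                         # how many terms fall in each cluster
names(groups[groups == 1])            # the terms assigned to the first cluster
```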

## Next Steps