# We are going to use several packages.
# Make sure that the packages are installed.
#
# install.packages(c("tm", "SnowballC", "wordcloud", "proxy", "kernlab", "NLP", "openNLP",
#                    "ggplot2", "Rtsne", "caret", "e1071", "doParallel"))
library(ggplot2)

# We also have to install the models for the English language
#
# install.packages("openNLPmodels.en", repos="http://datacube.wu.ac.at/", type="source")

# Load the classic TIME Magazine collection, consisting of 425 articles from TIME Magazine of the 1960s
conn = file("time.mag.txt", open="r")
text = readLines(conn)
close(conn)

length(text)
head(text)

# Load a framework for text mining applications within R.
library(tm)

# Construct a corpus from a character vector source.
corpus <- Corpus(VectorSource(text))
length(corpus)
content(corpus[[1]])

#
# Transforming Text
#

# Change letters to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Remove stopwords (some of the most common, short function words, such as "the", "is", "at", "which", etc.)
corpus <- tm_map(corpus, removeWords, stopwords('english'))

# Read a custom stopwords list
conn = file("english.stop.txt", open="r")
mystopwords = readLines(conn)
close(conn)

# Inspect our custom stopwords list
mystopwords

# Remove the custom stopwords
corpus <- tm_map(corpus, removeWords, mystopwords)

# Stem words to retrieve their radicals, so that the various forms derived from a stem
# are treated as the same term when counting word frequency
corpus <- tm_map(corpus, stemDocument)

# Strip extra whitespace from text documents
corpus <- tm_map(corpus, stripWhitespace)

# Have a look at the first document in the corpus
content(corpus[[1]])

#
# A Term-Document Matrix
#

# A term-document matrix represents the relationship between terms and documents,
# where each row stands for a term and each column for a document, and an entry
# is the number of occurrences of the term in the document.
tdm <- TermDocumentMatrix(corpus)

# The term-document matrix is composed of 14557 terms and 425 documents.
# It is very sparse, with 99% of the entries being zero.
tdm

# Retrieve the list of terms
rownames(tdm)

# Find the term "moscow"
idx <- which(rownames(tdm) == "moscow")
idx

# Have a look at the term "moscow" in the documents
inspect(tdm[idx,])

#
# Frequent Terms
#

# Inspect frequent words (with frequency no less than 300)
findFreqTerms(tdm, lowfreq=300)

# Plot the (sorted) frequencies of the most frequent words in the corpus
termFrequency <- rowSums(as.matrix(tdm))
termFrequency <- subset(termFrequency, termFrequency >= 300)
qplot(seq(length(termFrequency)), sort(termFrequency), xlab = "index", ylab = "Freq")
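# Optional: the same frequent terms can also be shown as a labelled bar chart.
# This is a sketch built on the termFrequency vector computed above; it only
# assumes that ggplot2 is already loaded (see library(ggplot2) at the top).
freqDf <- data.frame(term = names(termFrequency), freq = as.numeric(termFrequency))
ggplot(freqDf, aes(x = reorder(term, freq), y = freq)) +
  geom_col() +
  coord_flip() +
  labs(x = "term", y = "frequency")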
# A word cloud is a visual representation of text data.
# Tags are single words, and the importance of each tag is shown with font size and color.
library(wordcloud)

# Convert the term-document matrix to a normal matrix and calculate word frequencies
mat <- as.matrix(tdm)
wordFreq <- sort(rowSums(mat), decreasing=TRUE)
grayLevels <- gray((wordFreq+10) / (max(wordFreq)+10))
wordcloud(words=names(wordFreq), freq=wordFreq, min.freq=100, random.order=F, colors=grayLevels)

#
# Associations
#

# Find terms associated with "moscow" with correlation no less than 0.5
findAssocs(tdm, "moscow", 0.5)

#
# Clustering of words according to their co-occurrence in documents
#

# Remove sparse terms (terms with more than 70% zero entries, i.e. terms appearing in fewer than 30% of the documents)
tdm2 <- removeSparseTerms(tdm, sparse=0.7)
mat <- as.matrix(tdm2)

# The distances between terms can be calculated using the dist() function
distMatrix <- dist(mat)

# Find clusters of words with hierarchical clustering
fit <- hclust(distMatrix, method="ward.D")
plot(fit)

#
# Clustering of documents according to co-occurrence of terms
#

# Construct a document-term matrix with TF-IDF weighting
dtm <- DocumentTermMatrix(corpus, control = list(weighting=weightTfIdf))
mat <- as.matrix(dtm)

# Cluster the documents using the kmeans method with the number of clusters set to 3
k <- 3
kmeansResult <- kmeans(mat, k)

# Find the most popular words in every cluster
for (i in 1:k) {
  s <- sort(kmeansResult$centers[i,], decreasing=T)
  cat(names(s)[1:10], "\n")
}

# Compute the dissimilarity between the first two documents using the cosine distance measure.
# Cosine similarity is a measure of similarity between two vectors of an inner product space
# that measures the cosine of the angle between them. Each term represents a different dimension,
# and a document is characterised by a vector where the value of each dimension corresponds to
# the number of times that term appears in the document. Cosine similarity then gives a useful
# measure of how similar two documents are likely to be in terms of their subject matter.
library(proxy)
content(corpus[[1]])
content(corpus[[2]])
dist(as.matrix(dtm)[c(1,2),], method = "cosine")

# Find the most similar documents in the document-term matrix (this may take a few moments to compute...)
dtm2 <- removeSparseTerms(dtm, sparse=0.7)
mat <- as.matrix(dtm2)
dist.mat <- as.matrix(dist(mat, method = "cosine"))
sim.idx <- which(dist.mat == min(dist.mat[dist.mat > 0]), arr.ind = TRUE)
sim.idx

# Have a look at these documents
content(corpus[[sim.idx[1,1]]])
content(corpus[[sim.idx[1,2]]])
dist(mat[c(sim.idx[1,1], sim.idx[1,2]),], method = "cosine")

#
#
# Document classification
#
#

# Construct a corpus from a directory source. Each file in the directory is considered to be a document.
corpus <- Corpus(DirSource("econ"))
length(corpus)

# Corpus preprocessing
library(SnowballC)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords('english'))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)

#
# Transforming the corpus into a data set
#

# Construct a document-term matrix with TF-IDF weighting
data.tfidf <- DocumentTermMatrix(corpus, control = list(weighting=weightTfIdf))

# Read the document classes
Topic <- read.table("economics-topics.txt")

# Visualize the document-term matrix via decomposition (e.g., PCA)
econ.mat <- as.matrix(data.tfidf)
text.pca <- prcomp(econ.mat)
dim(text.pca$x)
length(Topic$V1)
comp1 <- as.numeric(text.pca$x[,1])
comp2 <- as.numeric(text.pca$x[,2])
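# Optional: before plotting, we can check how much of the variance the first two
# principal components actually capture. This is a quick sketch; summary() on a
# prcomp object reports the proportion of variance per component.
summary(text.pca)$importance[, 1:2]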
# Let's visualize the possible separation in 2D orthogonal space!
qplot(comp1, comp2, color = Topic$V1)

# Or a nonlinear projection with t-SNE
library(Rtsne)
tsne.proj <- Rtsne(econ.mat, perplexity=20, theta=0.2, dims=2, check_duplicates = F)
df.tsne <- tsne.proj$Y
qplot(df.tsne[,1], df.tsne[,2], color = Topic$V1)

# Construct a data set
data <- cbind(econ.mat, Topic)
names(data)[ncol(data)] <- "Topic"

# Randomly hold out 200 documents as a test set
sel <- sample(nrow(data), 200, F)
train <- data[-sel,]
test <- data[sel,]
#names(train)

#
# Document classification using kNN models
#
library(class)

# Identify the class column
r <- which(names(data)=="Topic")
predicted <- knn(train[,-r], test[,-r], train$Topic)
observed <- test$Topic
t <- table(observed, predicted)
t

# Classification accuracy
sum(diag(t))/sum(t)

# Recall is the fraction of relevant instances that are retrieved (here calculated for the class "general")
t[1,1]/sum(t[1,])

# Precision is the fraction of retrieved instances that are relevant (here calculated for the class "general")
t[1,1]/sum(t[,1])

#
# Document classification using SVM
#
library(caret)
library(e1071) # for SVMs

# Alternative option using the kernlab library (commented out):
#library(kernlab)
#model.svm <- ksvm(Topic ~ ., train, kernel = "rbfdot")
#model.svm <- ksvm(Topic ~ ., train, kernel = "poly", kpar=list(degree=2))

# Let's train on a subset of the features (the last ~500 columns).
# Exercise: Relax this threshold and observe the performance!
num.features <- 500
svm.train <- train[,(ncol(train)-num.features):ncol(train)]
svm.test <- test[,(ncol(test)-num.features):ncol(test)]

set.seed(112233)

library(parallel)
# Calculate the number of cores
no_cores <- detectCores() - 1
library(doParallel)
# Create the cluster for caret to use
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

# SVM with a linear kernel
model.svm <- train(as.factor(Topic) ~ ., data = svm.train, method = "svmLinear")
predicted <- predict(model.svm, svm.test, type = "raw")
t <- table(observed, predicted)
t

# Classification accuracy
sum(diag(t))/sum(t)
# Recall (here calculated for the class "general")
t[1,1]/sum(t[1,])
# Precision (here calculated for the class "general")
t[1,1]/sum(t[,1])

# SVM with a polynomial kernel
model.svm <- train(as.factor(Topic) ~ ., data = svm.train, method = "svmPoly")
predicted <- predict(model.svm, svm.test, type = "raw")
t <- table(observed, predicted)
t

# Classification accuracy
sum(diag(t))/sum(t)
# Recall is the fraction of relevant instances that are retrieved (here calculated for the class "general")
t[1,1]/sum(t[1,])
# Precision is the fraction of retrieved instances that are relevant (here calculated for the class "general")
t[1,1]/sum(t[,1])

stopCluster(cl)
registerDoSEQ()
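# Optional: the accuracy/recall/precision calculations above can be wrapped into a
# small helper so they do not have to be repeated for every model. This is a sketch;
# the function name and the added F1 score are our own additions, and it reports the
# metrics for the first class of the confusion table (here "general").
class.metrics <- function(t) {
  accuracy  <- sum(diag(t)) / sum(t)
  recall    <- t[1,1] / sum(t[1,])
  precision <- t[1,1] / sum(t[,1])
  f1        <- 2 * precision * recall / (precision + recall)
  c(accuracy = accuracy, recall = recall, precision = precision, f1 = f1)
}
class.metrics(t)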
#
# Tokenization, sentence segmentation, part-of-speech tagging
#

# Detach caret and ggplot2, since ggplot2::annotate() would mask NLP::annotate()
# (caret has to be detached first because it depends on ggplot2)
detach("package:caret", unload=TRUE)
detach("package:ggplot2", unload=TRUE)

library(NLP)
library(openNLP)
library(openNLPmodels.en)
# install.packages("openNLPmodels.en", dependencies=TRUE, repos = "http://datacube.wu.ac.at/")

s <- "Steven Allan Spielberg is an American filmmaker and business magnate. Spielberg is consistently considered as one of the leading pioneers of the New Hollywood era, as well as being viewed as one of the most popular and influential filmmakers in the history of cinema."
s <- as.String(s)

# Generate an annotator which computes sentence annotations
sent_ann <- Maxent_Sent_Token_Annotator()
sent_ann
a1 <- annotate(s, sent_ann)
a1
# Extract sentences
s[a1]

# Generate an annotator which computes word token annotations
word_ann <- Maxent_Word_Token_Annotator()
word_ann
a2 <- annotate(s, word_ann, a1)
a2
# Extract words
a2w <- subset(a2, type == "word")
s[a2w]

# Generate an annotator which computes POS tag annotations
pos_ann <- Maxent_POS_Tag_Annotator()
pos_ann
a3 <- annotate(s, pos_ann, a2)
a3

# Part-of-speech labels
#
# NN  - Noun
# NNP - Proper noun
# NNS - Noun, plural
# VB  - Verb, base form
# VBD - Verb, past tense
# VBG - Verb, gerund or present participle
# VBN - Verb, past participle
# VBP - Verb, non-3rd person singular present
# VBZ - Verb, 3rd person singular present
# DT  - Determiner
# JJ  - Adjective
# JJR - Adjective, comparative
# JJS - Adjective, superlative
# IN  - Preposition or subordinating conjunction
# PRP - Personal pronoun
# RB  - Adverb
# RBR - Adverb, comparative
# RBS - Adverb, superlative
# CC  - Coordinating conjunction
# CD  - Cardinal number
# ...

a3w <- subset(a3, type == "word")
a3w

# Extract token/POS pairs
tags <- vector()
for (i in 1:length(a3w$features)) tags <- c(tags, a3w$features[[i]]$POS)
table(tags)
tokenPOS <- cbind(s[a3w], tags)
tokenPOS

# Identify all nouns in the text
tokenPOS[substr(tokenPOS[,2], 1, 2) == "NN", 1]
# Identify all verbs in the text
tokenPOS[substr(tokenPOS[,2], 1, 2) == "VB", 1]
# Identify all adjectives in the text
tokenPOS[substr(tokenPOS[,2], 1, 2) == "JJ", 1]

#
# Entity recognition for persons, dates, locations, and organizations
#
# install.packages("openNLPmodels.en", dependencies=TRUE, repos = "http://datacube.wu.ac.at/")

s <- "LeBron Raymone James (born on December 30, 1984, in Akron, Ohio) is an American professional basketball player for the Cleveland Cavaliers of the National Basketball Association (NBA)."
s <- as.String(s)

sent_ann <- Maxent_Sent_Token_Annotator()
word_ann <- Maxent_Word_Token_Annotator()
person_ann <- Maxent_Entity_Annotator(kind = "person")
date_ann <- Maxent_Entity_Annotator(kind = "date")
location_ann <- Maxent_Entity_Annotator(kind = "location")
organization_ann <- Maxent_Entity_Annotator(kind = "organization")

ann <- annotate(s, list(sent_ann, word_ann, person_ann, date_ann, location_ann, organization_ann))
ann

# Extract the entities of a given kind from the annotations
entities <- function(annots, kind) {
  k <- sapply(annots$features, `[[`, "kind")
  s[annots[k == kind]]
}

entities(ann, "person")
entities(ann, "date")
entities(ann, "location")
entities(ann, "organization")
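# Optional: collect every recognized entity together with its kind in one table.
# This is a sketch built on the `ann` annotations above; filtering on a non-NULL
# "kind" feature and the data frame layout are our own additions.
kinds <- sapply(ann$features, function(f) if (is.null(f$kind)) NA_character_ else f$kind)
ent <- ann[!is.na(kinds)]
data.frame(kind = kinds[!is.na(kinds)], entity = s[ent], stringsAsFactors = FALSE)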