---
title: "MWERA Shapes in Clouds"
author: "brooksg@ohio.edu"
date: "2024-10-12"
output:
  html_document:
    toc: yes
    number_sections: yes
editor_options:
  chunk_output_type: console
---

Uses R 4.4.1 (wordcloud2 does not work well in R 4.4.0)

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
# Reset to the default margins and a single-panel layout, in case an
# earlier session left par() in a modified state.
par(mar = c(5.1, 4.1, 4.1, 2.1))
par(mfrow = c(1, 1))
```

Websites for Wordclouds

* https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a
* https://r-graph-gallery.com/wordcloud.html
* http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know
* https://lukesingham.com/how-to-make-a-word-cloud-using-r/

Packages

```{r echo=FALSE, include=FALSE}
#---
# Install any packages that are not already present
#---
pkgs <- c("tm", "textclean", "wordcloud", "SnowballC", "plotly",
          "RColorBrewer", "wordcloud2", "webshot", "htmlwidgets",
          "tidyverse", "tidytext")
# installed.packages() returns a matrix; the package names are its rownames.
# Matching against the whole matrix would also compare versions and paths.
inst <- pkgs %in% rownames(installed.packages())
if (any(!inst)) install.packages(pkgs[!inst], dependencies = TRUE)
#---
# Load
#---
library("tm")
library("textclean")
library("wordcloud")
library("SnowballC")
library("plotly")
library("RColorBrewer")
library("wordcloud2")
library("webshot")
library("htmlwidgets")
library("tidyverse")
library("tidytext")
```

## Wordclouds

### Load Data

Before importing, you should save your file as UNICODE UTF-8 or else you need
to replace fancy punctuation yourself (using select, search, replace with
simple, or space). This is perhaps easiest to do in a generic text editor like
Notepad. For example:

* fancy quote marks (fancy “ ” or ' ')
* fancy en and em dashes (fancy -- or ---)
* fancy ellipsis (fancy ...)
* fancy list characters (bullet lists)
* and anything else fancy

Also, it is easiest to remove URL (web links) yourself. Also, file must end
with a blank line.
```{r echo=FALSE, include=FALSE}
filePath <- "Data_All_Combined.txt"
text <- readLines(filePath)
```

### Clean the text data

This file was imported after saving in Word as UNICODE UTF-8

```{r echo=FALSE, include=FALSE}
text <- tolower(text)                     # make lowercase
# Remove links and @mentions BEFORE stripping punctuation; once ':' '/' '@'
# have been replaced by spaces those patterns can no longer match anything.
text <- gsub("https?://\\S+", " ", text)  # remove links (http and https)
text <- gsub("@\\w+", " ", text)          # remove @mentions
# [[:punct:]] already covers / ; ' " etc., so no separate removals needed
text <- gsub("[[:punct:]]", " ", text)    # remove all punctuation
text <- gsub("[[:digit:]]", " ", text)    # remove all numbers
text <- gsub("[ \t]{2,}", " ", text)      # collapse runs of spaces/tabs
text <- gsub("^ +", "", text)             # remove spaces at start (replace with "", not " ")
text <- gsub(" +$", "", text)             # remove spaces at end
```

### Create Corpus

```{r echo=FALSE, include=FALSE}
library(tm)
docs <- tm::Corpus(tm::VectorSource(text))
tm::inspect(docs[1:3])
#---
# Content transformer that replaces every regex match with a single space
toSpace <- tm::content_transformer(function(x, pattern) gsub(pattern, " ", x))
#---
docs <- tm::tm_map(docs, tm::removeNumbers)
docs <- tm::tm_map(docs, tm::removePunctuation)
# Second pass with ucp = TRUE also strips Unicode ("fancy") punctuation
docs <- tm::tm_map(docs, tm::removePunctuation, ucp = TRUE)
docs <- tm::tm_map(docs, tm::content_transformer(tolower))
docs <- tm::tm_map(docs, toSpace, "/")
docs <- tm::tm_map(docs, toSpace, "@")
docs <- tm::tm_map(docs, toSpace, "\\|")
docs <- tm::tm_map(docs, toSpace, ";")
# Repair contraction remnants left behind by punctuation removal
# (e.g. "don't" -> "don t" -> "don not"-style cleanup)
docs <- tm::tm_map(docs, gsub, pattern = ' t ', replacement = ' not ')
docs <- tm::tm_map(docs, gsub, pattern = ' ve ', replacement = ' have ')
docs <- tm::tm_map(docs, gsub, pattern = ' m ', replacement = ' am ')
docs <- tm::tm_map(docs, gsub, pattern = ' s ', replacement = ' ')
docs <- tm::tm_map(docs, tm::removeWords, tm::stopwords("english"))
docs <- tm::tm_map(docs, tm::removeWords,
                   c("this", "that", "and", "the", "other", "thing"))
docs <- tm::tm_map(docs, tm::stripWhitespace)
docs
tm::inspect(docs[1:3])
```

### Create TDM (term-document-matrix)

TDM is a Table of counts

Some words end up being too
many letters). You can remove such words with smaller *maxLength*

You can change minimum number of times for a word to be included with *minFreq*

You can change the total number of words in the cloud with *maxTerms*

```{r echo=FALSE, include=FALSE}
maxTerms <- 200   # max words shown in the cloud
minFreq <- 2      # min occurrences for a word to be included
maxLength <- 25   # max characters per word
tdm <- tm::TermDocumentMatrix(docs)
mtx <- as.matrix(tdm)
wrd <- sort(rowSums(mtx), decreasing = TRUE)  # total frequency per term
head(wrd)
dfALL <- data.frame(word = names(wrd), freq = wrd)
df <- subset(dfALL, subset = freq >= minFreq)
nrow(dfALL)
nrow(df)
df <- subset(df, subset = nchar(word) <= maxLength)
dfBackup <- df
# head() is safe even when the filtered frame has fewer than maxTerms rows
# (df[1:min(nrow(df), maxTerms), ] would produce NA rows on an empty frame)
df <- head(df, maxTerms)
```

### Optional: Stemming

```{r echo=FALSE, include=FALSE}
docx <- tm::tm_map(docs, tm::stemDocument)
xtdm <- tm::TermDocumentMatrix(docx)
xmtx <- as.matrix(xtdm)
xwrd <- sort(rowSums(xmtx), decreasing = TRUE)
xdf <- data.frame(word = names(xwrd), freq = xwrd)
xdf <- subset(xdf, subset = freq >= minFreq)
# <= for consistency with the unstemmed df filter above
xdf <- subset(xdf, subset = nchar(word) <= maxLength)
nrow(df)
nrow(xdf)
head(df)
head(xdf)
```

## Stats (Bar Chart, Other)

You can increase number of bars with *maxBars* (but 20 is a good max for bar
chart)

```{r echo=FALSE}
maxBars <- 20
barplot(df[1:maxBars, ]$freq, las = 2,
        names.arg = df[1:maxBars, ]$word,
        col = "lightblue",
        main = "Most frequent words",
        ylab = "Word frequencies")
df[1:maxBars, ]$word
#---
# Raise the frequency cutoff until at most 30 words remain
myLow <- 4
w <- sort(rowSums(mtx), decreasing = TRUE)
w <- subset(w, w >= myLow)
while (length(w) > 30) {
  myLow <- myLow + 1
  w <- subset(w, w >= myLow)
}
w
barplot(w, las = 2, col = rainbow(length(w)))
#---
w[1:3]
tm::findFreqTerms(tdm, lowfreq = myLow)
tm::findAssocs(tdm, terms = names(w[1:3]), corlimit = 0.7)
```

## Generate Basic Word Cloud

Wordcloud shows up in PLOTS output tab

Change myScale to make it fit better in the window

```{r echo=FALSE}
myScale <- c(3, 0.2)
myRot <- 0.35
mySeed <- 1804
set.seed(mySeed)  # for reproducibility
wordcloud::wordcloud(words = df$word, freq = df$freq, min.freq = minFreq,
                     max.words = maxTerms, scale = myScale,
                     random.order = FALSE, rot.per = myRot,
                     use.r.layout = FALSE,
                     colors = RColorBrewer::brewer.pal(8, "Dark2"))
#---
# to save the same plot as a file
#---
set.seed(mySeed)  # for reproducibility
# NOTE(review): the original call passed type = "windows", restoreConsole and
# symbolfamily, which are Windows-only arguments and make png() fail on
# macOS/Linux. The portable call below produces the same file everywhere.
png(filename = "wordcloud0.png", width = 6.5, height = 6.5, units = "in",
    pointsize = 12, bg = "white", res = 300)
wordcloud::wordcloud(words = df$word, freq = df$freq, min.freq = minFreq,
                     max.words = maxTerms,
                     random.order = FALSE, rot.per = 0.35,
                     use.r.layout = FALSE,
                     colors = RColorBrewer::brewer.pal(8, "Dark2"))
dev.off()
```

## Advanced Word Clouds

BEWARE sometimes words do NOT fit and they are excluded... PAY ATTENTION

These show up in VIEWER output tab

* size = Font size, default is 1. The larger size means the bigger word.
* shape = Available presets are 'circle' (default), 'cardioid' (apple or heart
  shape curve, the most known polar equation), 'diamond' (alias of square),
  'triangle-forward', 'triangle', 'pentagon', and 'star'.
* ellipticity = degree of "flatness" of the shape wordcloud2.js should draw.
#### Default

```{r echo=FALSE}
head(df)
wordcloud2::wordcloud2(data = df)
```

#### Stemmed

```{r echo=FALSE}
head(xdf)
wordcloud2::wordcloud2(data = xdf)
```

#### Resize

##### 0.2

```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.2)
```

##### 0.4

```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.4)
```

##### 0.8

```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.8)
```

#### Emphasize Frequent words less

```{r echo=FALSE}
# Damp the frequency scale: log(freq^3) compresses large counts so the
# most frequent words do not dominate the cloud as strongly.
df1 <- transform(df, freq = log(freq^3))
wordcloud2::wordcloud2(df1, size = 0.25)
```

#### Reverse Frequency

Lower frequency words in the middle

```{r echo=FALSE}
# Sorting ascending puts the rarest words first, so they land in the center
wordcloud2::wordcloud2(df[order(df$freq), ], size = 0.4)
```

#### Top N

##### ALL Terms Freq > 0

```{r echo=FALSE}
wordcloud2::wordcloud2(dfALL, size = 0.4)
```

##### Top 100 Terms

```{r echo=FALSE}
wordcloud2::wordcloud2(slice_max(df, order_by = freq, n = 100), size = 0.4)
```

##### Top 40 Terms

```{r echo=FALSE}
wordcloud2::wordcloud2(slice_max(df, order_by = freq, n = 40), size = 0.4)
```

##### Top 20 Terms

```{r echo=FALSE}
wordcloud2::wordcloud2(slice_max(df, order_by = freq, n = 20), size = 0.4)
```

#### Shapes

##### Pentagon ALL

```{r echo=FALSE}
wordcloud2::wordcloud2(data = dfALL, size = 0.5, shape = 'pentagon',
                       color = 'random-dark')
```

##### Pentagon Reduced

```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.5, shape = 'pentagon',
                       color = 'random-dark')
```

##### Triangle

```{r echo=FALSE}
# NOTE(review): in wordcloud2, minSize is the subtitle string, not a
# frequency cutoff — confirm that passing minFreq here is intentional.
wordcloud2::wordcloud2(data = df, size = 0.4, shape = 'triangle',
                       color = 'random-dark', rotateRatio = 0.8,
                       minSize = minFreq)
```

##### Triangle-forward

```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.4, shape = 'triangle-forward',
                       color = 'random-dark')
```

##### Cardioid

```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.4, shape = 'cardioid',
                       color = 'random-dark', ellipticity = 0.865)
```

#### Background

```{r echo=FALSE}
wordcloud2::wordcloud2(df, color = 'random-light', size = 0.4,
                       backgroundColor = "lavender")
```

#### Orientation
##### Perpendicular

```{r echo=FALSE}
wordcloud2::wordcloud2(df, size = 0.5, minRotation = -pi/2, maxRotation = -pi/2)
```

##### Rotate Some

```{r echo=FALSE}
wordcloud2::wordcloud2(df, size = 0.5, minRotation = -pi/6, maxRotation = -pi/6,
                       rotateRatio = 0.4)
```

##### Rotate All

```{r echo=FALSE}
wordcloud2::wordcloud2(df, minRotation = pi/4, maxRotation = pi/4, size = 0.5,
                       rotateRatio = 1)
```

##### Lower Ellipticity

```{r echo=FALSE}
wordcloud2::wordcloud2(df, size = 0.5, ellipticity = 0.1)
```

##### Higher Ellipticity

```{r echo=FALSE}
wordcloud2::wordcloud2(df, size = 0.5, ellipticity = 5.0)
```

#### Complete Command for a Star

```{r echo=FALSE}
starcloud <- wordcloud2::wordcloud2(data = dfALL,
                                    size = 0.3,
                                    minSize = 0,
                                    gridSize = 0,
                                    fontFamily = 'Segoe UI',
                                    fontWeight = 'normal',
                                    color = 'random-dark',
                                    backgroundColor = "white",
                                    minRotation = -pi/4,
                                    maxRotation = pi/4,
                                    shuffle = FALSE,
                                    rotateRatio = 0.4,
                                    shape = 'star',
                                    ellipticity = 0.8,
                                    widgetsize = NULL,
                                    figPath = NULL,
                                    hoverFunction = NULL)
starcloud
```

### STEP 7: Save Webshot

```{r eval=FALSE}
webshot::install_phantomjs()
#---
# selfcontained = FALSE keeps the widget's assets in a sibling folder,
# which webshot can then render.
htmlwidgets::saveWidget(starcloud, "starcloud.html", selfcontained = FALSE)
#---
webshot::webshot("starcloud.html", "starcloud.png",
                 vwidth = 800, vheight = 800, delay = 5)
#---
webshot::webshot("starcloud.html", "starcloud.pdf",
                 vwidth = 800, vheight = 800, delay = 5)
```

### STEP 8: More Advanced

#### Letter-Shaped Cloud

Sometimes need to run this 3-4 times to get it to work... and sometimes never
works...

```{r include=FALSE, eval=FALSE}
wordcloud2::letterCloud(df, word = "MW", size = 0.8)
```

#### Shapes in Files

Sometimes need to run these 3-4 times to get them to work... and sometimes
never works...
```{r include=FALSE, eval=FALSE}
figFile <- "pumpkin.png"
wordcloud2::wordcloud2(dfALL, figPath = figFile, size = 1, color = 'darkorange')
```

## Sentiment Analysis

```{r echo=FALSE}
# NOTE(review): syuzhet is not in the pkgs install list at the top of this
# document — add it there (or install manually) before running this chunk.
library(syuzhet)
s <- syuzhet::get_nrc_sentiment(text)
# head(syuzhet::get_sentences(text))
# head(s)
barplot(colSums(s), las = 2, col = rainbow(10),
        ylab = 'Count', main = 'Sentiment Scores')
```

## BIGRAM NETWORK

These show up in PLOTS output tab

```{r echo=FALSE}
myfilter <- 1   # keep bigrams that occur more than this many times
myNgram <- 2
df_text <- tibble(num = seq_along(text), text = text)
head(df_text)
summry <- df_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = myNgram) %>%
  select(bigram)
summry %>% count(bigram, sort = TRUE)
bigram_split <- summry %>%
  separate(col = bigram, into = c("word1", "word2"), sep = " ")
# Drop bigrams where either word is a stop word
bigram_cleaned <- bigram_split %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
bigram_cleaned <- bigram_cleaned %>% count(word1, word2, sort = TRUE)
bigram_cleaned <- subset(bigram_cleaned, subset = !is.na(bigram_cleaned$word1))
bigram_cleaned <- subset(bigram_cleaned, subset = n > myfilter)
bigram_cleaned %>% unite(col = bigram, word1, word2, sep = " ")
tail(bigram_cleaned)
# NOTE(review): ggraph/igraph are also missing from the pkgs install list.
library(ggraph)
network_plot <- bigram_cleaned %>%
  filter(n > myfilter) %>%
  igraph::graph_from_data_frame()
wordnet <- network_plot %>%
  ggraph(layout = "kk") +
  geom_edge_link(aes(col = factor(n))) +
  geom_node_point() +
  geom_node_text(aes(label = name))
wordnet
# Portable png() call (Windows-only args removed; see Basic Word Cloud note)
png(filename = "wordnetwork1.png", width = 6.5, height = 6.5, units = "in",
    pointsize = 11, bg = "white", res = 300)
wordnet
dev.off()
```

## Compare Participants Files

#### Compare STEP 1: Open

Open files in a subfolder

```{r echo=FALSE, include=FALSE}
docsComp <- tm::Corpus(tm::DirSource(directory = "Participants"))
# tm::inspect(docsComp)
```

#### Compare STEP 2: Clean

```{r echo=FALSE, include=FALSE}
docsComp <- tm::tm_map(docsComp, tm::content_transformer(tolower))
docsComp <- tm::tm_map(docsComp, tm::removeNumbers)
docsComp <- tm::tm_map(docsComp, tm::removePunctuation)
docsComp <- tm::tm_map(docsComp, tm::removePunctuation, ucp = TRUE)
docsComp <- tm::tm_map(docsComp, tm::stripWhitespace)
docsComp <- tm::tm_map(docsComp, tm::removeWords, tm::stopwords("english"))
docsComp <- tm::tm_map(docsComp, tm::removeWords,
                       c("this", "that", "the", "other"))
# tm::inspect(docsComp)
```

#### Compare STEP 3: Matrix

```{r echo=FALSE, include=FALSE}
minFreqC <- 2   # min occurrences for a word to be included
maxWordC <- 25  # max characters per word
tdmComp <- tm::TermDocumentMatrix(docsComp)
mtxComp <- as.matrix(tdmComp)
wrdComp <- sort(rowSums(mtxComp), decreasing = TRUE)
dfC <- data.frame(word = names(wrdComp), freq = wrdComp)
# >= / <= to match the inclusive cutoffs used for df earlier in the document
dfC <- subset(dfC, subset = freq >= minFreqC)
dfC <- subset(dfC, subset = nchar(word) <= maxWordC)
head(dfC)
nrow(dfC)
dfComp <- dfC
```

### Compare STEP 4: Combine

```{r echo=FALSE}
wordcloud2::wordcloud2(data = dfC, size = 0.5)
```

### Compare STEP 5: Compare

```{r echo=FALSE}
# Label one column per document; derived from the matrix so this works for
# any number of participant files, not just three.
colnames(mtxComp) <- paste0("Participant_", seq_len(ncol(mtxComp)))
png(filename = "comparecloud0.png", width = 6.5, height = 6.5, units = "in",
    pointsize = 12, bg = "white", res = 300)
wordcloud::comparison.cloud(mtxComp, max.words = 100, random.order = FALSE,
                            title.size = 1, title.bg.colors = c("lightgray"),
                            match.colors = TRUE, scale = c(3, 0.2))
dev.off()
wordcloud::comparison.cloud(mtxComp, max.words = 100, random.order = FALSE,
                            title.size = 1, title.bg.colors = c("lightgray"),
                            match.colors = TRUE, scale = c(3, 0.2))
```

### Compare STEP 6: Commonality

```{r echo=FALSE}
wordcloud::commonality.cloud(mtxComp, max.words = 200, random.order = FALSE,
                             colors = RColorBrewer::brewer.pal(8, "Dark2"))
```

## Compare Questions Files

#### Compare STEP 1: Open

Open files in a subfolder

```{r echo=FALSE, include=FALSE}
docsComp <- tm::Corpus(tm::DirSource(directory = "Questions"))
# tm::inspect(docsComp)
```

#### Compare STEP 2: Clean

```{r echo=FALSE, include=FALSE}
docsComp <- tm::tm_map(docsComp, tm::content_transformer(tolower))
docsComp <- tm::tm_map(docsComp, tm::removeNumbers)
docsComp <- tm::tm_map(docsComp, tm::removePunctuation)
docsComp <- tm::tm_map(docsComp, tm::removePunctuation, ucp = TRUE)
docsComp <- tm::tm_map(docsComp, tm::stripWhitespace)
docsComp <- tm::tm_map(docsComp, tm::removeWords, tm::stopwords("english"))
docsComp <- tm::tm_map(docsComp, tm::removeWords,
                       c("this", "that", "the", "other"))
# tm::inspect(docsComp)
```

#### Compare STEP 3: Matrix

```{r echo=FALSE, include=FALSE}
minFreqC <- 2
maxWordC <- 25
tdmComp <- tm::TermDocumentMatrix(docsComp)
mtxComp <- as.matrix(tdmComp)
wrdComp <- sort(rowSums(mtxComp), decreasing = TRUE)
dfC <- data.frame(word = names(wrdComp), freq = wrdComp)
# >= / <= to match the inclusive cutoffs used for df earlier in the document
dfC <- subset(dfC, subset = freq >= minFreqC)
dfC <- subset(dfC, subset = nchar(word) <= maxWordC)
head(dfC)
nrow(dfC)
dfComp <- dfC
```

### Compare STEP 4: Combine

```{r echo=FALSE}
wordcloud2::wordcloud2(data = dfC, size = 0.5)
```

### Compare STEP 5: Compare

```{r echo=FALSE}
png(filename = "comparecloudP.png", width = 6.5, height = 6.5, units = "in",
    pointsize = 12, bg = "white", res = 300)
wordcloud::comparison.cloud(mtxComp, max.words = 100, random.order = FALSE,
                            title.size = 1, title.bg.colors = c("lightgray"),
                            match.colors = TRUE, scale = c(3, 0.2))
dev.off()
wordcloud::comparison.cloud(mtxComp, max.words = 100, random.order = FALSE,
                            title.size = 1, title.bg.colors = c("lightgray"),
                            match.colors = TRUE, scale = c(3, 0.2))
```

### Compare STEP 6: Commonality

```{r echo=FALSE}
wordcloud::commonality.cloud(mtxComp, max.words = 200, random.order = FALSE,
                             colors = RColorBrewer::brewer.pal(8, "Dark2"))
```

# END
View Site in Mobile | Classic
Share by: