---
title: "MWERA Shapes in Clouds"
author: "brooksg@ohio.edu"
date: "2024-10-12"
output:
  html_document:
    toc: yes
    number_sections: yes
editor_options:
  chunk_output_type: console
---
Uses R 4.4.1
(wordcloud2 does not work well in R 4.4.0)
```{r setup, include=FALSE}
# Show code in the rendered document by default.
knitr::opts_chunk$set(echo = TRUE)
# Restore the default base-graphics margins and a single-panel layout.
par(mar = c(5.1, 4.1, 4.1, 2.1), mfrow = c(1, 1))
```
Websites for Wordclouds
* https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a
* https://r-graph-gallery.com/wordcloud.html
* http://www.sthda.com/english/wiki/text-mining-and-word-cloud-fundamentals-in-r-5-simple-steps-you-should-know
* https://lukesingham.com/how-to-make-a-word-cloud-using-r/
Packages
```{r echo=FALSE, include=FALSE}
#---
# Install any packages not already present.
#---
# NOTE: "syuzhet", "igraph", and "ggraph" are used later in this document
# (sentiment analysis and the bigram network) but were missing from this list.
pkgs <- c("tm", "textclean", "wordcloud", "SnowballC", "plotly", "RColorBrewer",
          "wordcloud2", "webshot", "htmlwidgets", "tidyverse", "tidytext",
          "syuzhet", "igraph", "ggraph")
# installed.packages() returns a matrix; compare against its row names
# (package names) rather than the whole matrix.
inst <- pkgs %in% rownames(installed.packages())
if (any(!inst)) install.packages(pkgs[!inst], dependencies = TRUE)
#---
# Load
#---
# Attach every package used below, in the same order as before;
# character.only lets library() take the name from the loop variable.
for (pkg in c("tm", "textclean", "wordcloud", "SnowballC", "plotly",
              "RColorBrewer", "wordcloud2", "webshot", "htmlwidgets",
              "tidyverse", "tidytext")) {
  library(pkg, character.only = TRUE)
}
```
## Wordclouds
### Load Data
Before importing, you should save your file as UNICODE UTF-8 or else you need to replace fancy punctuation yourself (using select, search, replace with simple, or space). This is perhaps easiest to do in a generic text editor like Notepad. For example:
* fancy (curly) quote marks (“ ” or ‘ ’)
* fancy en and em dashes (fancy -- or ---)
* fancy ellipsis (fancy ...)
* fancy list characters (bullet lists)
* and anything else fancy
Also, it is easiest to remove URL (web links) yourself.
Also, file must end with a blank line.
```{r echo=FALSE, include=FALSE}
# Source data: one response per line, saved as UTF-8; the file must end
# with a final newline (see note above the chunk).
filePath <- "Data_All_Combined.txt"
text <- readLines(con = filePath)
```
### Clean the text data
This file was imported after saving in Word as UNICODE UTF-8
```{r echo=FALSE, include=FALSE}
text <- tolower(text)                  # make lowercase
text <- gsub("[[:punct:]]", " ", text) # replace ALL punctuation with spaces
text <- gsub("[[:digit:]]", " ", text) # replace all digits with spaces
# NOTE(review): the next four substitutions are redundant -- "/", ";", "'",
# and '"' are already covered by [[:punct:]] above -- kept as harmless
# belt-and-braces cleanup.
text <- gsub("/", " ", text)           # remove /
text <- gsub(";", " ", text)           # remove ;
text <- gsub("'", "", text)            # remove '
text <- gsub('"', "", text)            # remove " (old pattern '' could never match)
text <- gsub("@\\w+", " ", text)       # remove @mentions
text <- gsub("https:\\w+", " ", text)  # remove (partial) links
text <- gsub("[ |\t]{2,}", " ", text)  # collapse runs of spaces/tabs
text <- gsub("^ +", "", text)          # strip leading spaces (old "^ " -> " " was a no-op)
text <- gsub(" +$", "", text)          # strip trailing spaces (old " $" -> " " was a no-op)
```
### Create Corpus
```{r echo=FALSE, include=FALSE}
# Build a tm corpus from the cleaned text vector and run a second round of
# cleaning with tm transformations.
library(tm)
docs <- tm::Corpus(tm::VectorSource(text))
tm::inspect(docs[1:3])
#---
# Content transformer that replaces every match of a regex with a space.
toSpace <- tm::content_transformer(function (x , pattern ) gsub(pattern, " ", x))
#---
# NOTE(review): several steps below repeat cleaning already applied to `text`
# (numbers, punctuation, lowercase); re-running them is redundant but harmless.
docs <- tm::tm_map(docs, tm::removeNumbers)
docs <- tm::tm_map(docs, tm::removePunctuation)
docs <- tm::tm_map(docs, tm::removePunctuation, ucp = TRUE) # Unicode punctuation too
docs <- tm::tm_map(docs, tm::content_transformer(tolower))
docs <- tm::tm_map(docs, toSpace, "/")
docs <- tm::tm_map(docs, toSpace, "@")
docs <- tm::tm_map(docs, toSpace, "\\|")
docs <- tm::tm_map(docs, toSpace, ";")
# After apostrophes are stripped, contraction fragments survive as isolated
# tokens (e.g. "don't" -> "don t"); map them back to full words.
docs <- tm::tm_map(docs, gsub, pattern = ' t ', replacement = ' not ')
docs <- tm::tm_map(docs, gsub, pattern = ' ve ', replacement = ' have ')
docs <- tm::tm_map(docs, gsub, pattern = ' m ', replacement = ' am ')
docs <- tm::tm_map(docs, gsub, pattern = ' s ', replacement = ' ')
# Drop English stopwords plus a few extra fillers (some overlap the stopword list).
docs <- tm::tm_map(docs, tm::removeWords, tm::stopwords("english"))
docs <- tm::tm_map(docs, tm::removeWords, c("this", "that", "and", "the", "other", "thing"))
docs <- tm::tm_map(docs, tm::stripWhitespace)
docs
tm::inspect(docs[1:3])
```
### Create TDM (term-document-matrix)
TDM is a Table of counts
Some words end up being too long (too many letters). You can remove such words with smaller *maxLength*
You can change minimum number of times for a word to be included with *minFreq*
You can change the total number of words in the cloud with *maxTerms*
```{r echo=FALSE, include=FALSE}
maxTerms <- 200   # maximum number of words kept for the cloud
minFreq <- 2      # minimum frequency for a word to be included
maxLength <- 25   # maximum word length in characters
tdm <- tm::TermDocumentMatrix(docs)
mtx <- as.matrix(tdm)
wrd <- sort(rowSums(mtx),decreasing=TRUE)   # total frequency per term
head(wrd)
dfALL <- data.frame(word = names(wrd), freq = wrd)
df <- subset(dfALL, subset = freq >= minFreq)
nrow(dfALL)
nrow(df)
df <- subset(df, subset = nchar(word) <= maxLength)
dfBackup <- df
# head() is safe when df has fewer than maxTerms rows; the previous
# df[1:min(nrow(df), maxTerms), ] produced an NA row when df was empty.
df <- head(df, maxTerms)
```
### Optional: Stemming
```{r echo=FALSE, include=FALSE}
# Optional Porter stemming (via SnowballC) and a parallel frequency table.
docx <- tm::tm_map(docs, tm::stemDocument)
xtdm <- tm::TermDocumentMatrix(docx)
xmtx <- as.matrix(xtdm)
xwrd <- sort(rowSums(xmtx),decreasing=TRUE)
xdf <- data.frame(word = names(xwrd), freq = xwrd)
xdf <- subset(xdf, subset = freq >= minFreq)
# Use <= for consistency with the unstemmed chunk above (which keeps words of
# exactly maxLength characters; this chunk previously used < and dropped them).
xdf <- subset(xdf, subset = nchar(word) <= maxLength)
# Compare unstemmed vs stemmed tables.
nrow(df)
nrow(xdf)
head(df)
head(xdf)
```
## Stats (Bar Chart, Other)
You can increase number of bars with *maxBars* (but 20 is a good max for bar chart)
```{r echo=FALSE}
maxBars <- 20
# head() avoids NA rows when df has fewer than maxBars entries, and the top
# slice is computed once instead of three times.
topBars <- head(df, maxBars)
barplot(topBars$freq, las = 2, names.arg = topBars$word, col = "lightblue",
        main = "Most frequent words", ylab = "Word frequencies")
topBars$word
#---
# Raise the frequency floor until at most 30 terms remain for the rainbow plot.
myLow <- 4
w <- sort(rowSums(mtx), decreasing = TRUE)
w <- subset(w, w >= myLow)
while (length(w) > 30) {
myLow <- myLow + 1
w <- subset(w, w >= myLow)
}
w
barplot(w, las = 2, col = rainbow(length(w)))
#---
w[1:3]
tm::findFreqTerms(tdm, lowfreq = myLow)
# Terms correlated (>= 0.7) with the three most frequent words.
tm::findAssocs(tdm, terms = names(w[1:3]), corlimit = 0.7)
```
## Generate Basic Word Cloud
Wordcloud shows up in PLOTS output tab
Change myScale to make it fit better in the window
```{r echo=FALSE}
myScale <- c(3, 0.2)  # largest / smallest font sizes
myRot <- 0.35         # fraction of words rotated 90 degrees
mySeed <- 1804
set.seed(mySeed) # for reproducibility
wordcloud::wordcloud(words = df$word, freq = df$freq, min.freq = minFreq,
                     max.words = maxTerms, scale = myScale, random.order = FALSE,
                     rot.per = myRot, use.r.layout = FALSE,
                     colors = RColorBrewer::brewer.pal(8, "Dark2"))
#---
# Save the SAME plot as a file.  The saved call now reuses myScale/myRot so it
# really matches the on-screen plot (previously the default scale was used).
#---
set.seed(mySeed) # for reproducibility
# Windows-only arguments (restoreConsole, type = "windows", symbolfamily)
# removed so this also runs on macOS/Linux; they were the defaults on Windows.
png(filename = "wordcloud0.png", width = 6.5, height = 6.5, units = "in",
    pointsize = 12, bg = "white", res = 300)
wordcloud::wordcloud(words = df$word, freq = df$freq, min.freq = minFreq,
                     max.words = maxTerms, scale = myScale, random.order = FALSE,
                     rot.per = myRot, use.r.layout = FALSE,
                     colors = RColorBrewer::brewer.pal(8, "Dark2"))
dev.off()
```
## Advanced Word Clouds
BEWARE sometimes words do NOT fit and they are excluded... PAY ATTENTION
These show up in VIEWER output tab
* size = Font size, default is 1. The larger size means the bigger word.
* shape = Available presents are 'circle' (default), 'cardioid' (apple or heart shape curve, the most known polar equation), 'diamond' (alias of square), 'triangle-forward', 'triangle', 'pentagon', and 'star'.
* ellipticity = degree of "flatness" of the shape wordcloud2.js should draw.
#### Default
```{r echo=FALSE}
# Default wordcloud2 rendering of the word/freq table.
head(df)
wordcloud2::wordcloud2(df)
```
#### Stemmed
```{r echo=FALSE}
# Same cloud, built from the stemmed frequency table.
head(xdf)
wordcloud2::wordcloud2(xdf)
```
#### Resize
##### 0.2
```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.2)
```
##### 0.4
```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.4)
```
##### 0.8
```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.8)
```
#### Emphasize Frequent words less
```{r echo=FALSE}
# Damp the influence of very frequent words: log of the cubed frequency.
df_damped <- df
df_damped$freq <- log(df$freq^3)
wordcloud2::wordcloud2(df_damped, size = 0.25)
```
#### Reverse Frequency
Lower frequency words in the middle
```{r echo=FALSE}
wordcloud2::wordcloud2(df[order(df$freq), ], size = 0.4)
```
#### Top N
##### ALL Terms Freq > 0
```{r echo=FALSE}
wordcloud2::wordcloud2(dfALL, size = 0.4)
```
##### Top 100 Terms
```{r echo=FALSE}
wordcloud2::wordcloud2(slice_max(df, order_by = freq, n=100), size = 0.4)
```
##### Top 40 Terms
```{r echo=FALSE}
wordcloud2::wordcloud2(slice_max(df, order_by = freq, n= 40), size = 0.4)
```
##### Top 20 Terms
```{r echo=FALSE}
wordcloud2::wordcloud2(slice_max(df, order_by = freq, n= 20), size = 0.4)
```
#### Shapes
##### Pentagon ALL
```{r echo=FALSE}
wordcloud2::wordcloud2(data = dfALL, size = 0.5, shape = 'pentagon', color = 'random-dark')
```
##### Pentagon Reduced
```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.5, shape = 'pentagon', color = 'random-dark')
```
##### Triangle
```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.4, shape = 'triangle', color = 'random-dark', rotateRatio = 0.8, minSize = minFreq)
```
##### Triangle-forward
```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.4, shape = 'triangle-forward', color = 'random-dark')
```
##### Cardioid
```{r echo=FALSE}
wordcloud2::wordcloud2(data = df, size = 0.4, shape = 'cardioid', color = 'random-dark', ellipticity = 0.865)
```
#### Background
```{r echo=FALSE}
wordcloud2::wordcloud2(df, color='random-light', size = 0.4, backgroundColor="lavender")
```
#### Orientation
##### Perpendicular
```{r echo=FALSE}
wordcloud2::wordcloud2(df, size = 0.5, minRotation = -pi/2, maxRotation = -pi/2)
```
##### Rotate Some
```{r echo=FALSE}
wordcloud2::wordcloud2(df, size = 0.5, minRotation = -pi/6, maxRotation = -pi/6, rotateRatio = 0.4)
```
##### Rotate All
```{r echo=FALSE}
wordcloud2::wordcloud2(df, minRotation = pi/4, maxRotation = pi/4, size = 0.5, rotateRatio = 1)
```
##### Lower Ellipticity
```{r echo=FALSE}
wordcloud2::wordcloud2(df, size = 0.5, ellipticity = 0.1)
```
##### Higher Ellipticity
```{r echo=FALSE}
wordcloud2::wordcloud2(df, size = 0.5, ellipticity = 5.0)
```
#### Complete Command for a Star
```{r echo=FALSE}
# Star-shaped cloud with every wordcloud2 argument spelled out explicitly;
# the widget is kept in `starcloud` so it can be saved in the next section.
starcloud <- wordcloud2::wordcloud2(
  data = dfALL, size = 0.3, minSize = 0, gridSize = 0,
  fontFamily = 'Segoe UI', fontWeight = 'normal',
  color = 'random-dark', backgroundColor = "white",
  minRotation = -pi/4, maxRotation = pi/4, shuffle = FALSE,
  rotateRatio = 0.4, shape = 'star', ellipticity = 0.8,
  widgetsize = NULL, figPath = NULL, hoverFunction = NULL
)
starcloud
```
### STEP 7: Save Webshot
```{r eval=FALSE}
# One-time setup: install the headless browser used by webshot.
webshot::install_phantomjs()
#---
# Save the widget as HTML (FALSE spelled out; T/F are reassignable and
# best avoided).
htmlwidgets::saveWidget(starcloud, "starcloud.html", selfcontained = FALSE)
#---
# Snapshot the rendered HTML; the delay gives the JavaScript time to draw.
webshot::webshot("starcloud.html", "starcloud.png", vwidth = 800, vheight = 800, delay = 5)
#---
webshot::webshot("starcloud.html", "starcloud.pdf", vwidth = 800, vheight = 800, delay = 5)
```
### STEP 8: More Advanced
#### Letter-Shaped Cloud
Sometimes need to run this 3-4 times to get it to work... and sometimes never works...
```{r include=FALSE, eval=FALSE}
wordcloud2::letterCloud(df, word="MW", size=0.8)
```
#### Shapes in Files
Sometimes need to run these 3-4 times to get them to work... and sometimes never works...
```{r include=FALSE, eval=FALSE}
# Use a silhouette image file as the cloud mask.
figFile <- "pumpkin.png"
wordcloud2::wordcloud2(dfALL, figPath = figFile, color = 'darkorange', size = 1)
```
## Sentiment Analysis
```{r echo=FALSE}
# NRC emotion/sentiment scoring of the raw text lines.
# NOTE(review): syuzhet is loaded here rather than in the setup chunk.
library(syuzhet)
nrc_scores <- syuzhet::get_nrc_sentiment(text)
# head(syuzhet::get_sentences(text))
# head(nrc_scores)
barplot(colSums(nrc_scores), las = 2, col = rainbow(10), ylab = 'Count', main = 'Sentiment Scores')
```
## BIGRAM NETWORK
These show up in PLOTS output tab
```{r echo=FALSE}
myfilter <- 1   # keep bigrams occurring more than this many times
myNgram <- 2    # bigrams
df_text <- tibble(num = seq_along(text), text = text)
head(df_text)
# Tokenize every line into bigrams.
bigrams <-
df_text %>%
unnest_tokens(bigram, text, token = "ngrams", n = myNgram) %>%
select(bigram)
bigrams %>% count(bigram, sort = TRUE)
# Split each bigram so stopwords can be filtered from either position.
bigram_split <- bigrams %>%
separate(col = bigram,
into = c("word1", "word2"),
sep = " ")
bigram_cleaned <-
bigram_split %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
bigram_cleaned <- bigram_cleaned %>%
count(word1, word2, sort = TRUE)
bigram_cleaned <- subset(bigram_cleaned, subset = !is.na(bigram_cleaned$word1))
bigram_cleaned <-
subset(bigram_cleaned, subset = n > myfilter)
# Display the cleaned bigrams re-joined into single strings (not assigned).
bigram_cleaned %>%
unite(col = bigram,
word1, word2,
sep = " ")
tail(bigram_cleaned)
library(ggraph)
# (An unused intermediate `graph` object was removed here.)
network_plot <- bigram_cleaned %>%
filter(n > myfilter) %>%
igraph::graph_from_data_frame()
# Kamada-Kawai layout; edges colored by bigram count.
wordnet <-
network_plot %>%
ggraph(layout = "kk") +
geom_edge_link(aes(col = factor(n))) +
geom_node_point() +
geom_node_text(aes(label = name))
wordnet
# Windows-only png arguments removed for portability; print() ensures the
# ggplot actually renders while the png device is open.
png(filename = "wordnetwork1.png", width = 6.5, height = 6.5, units = "in",
    pointsize = 11, bg = "white", res = 300)
print(wordnet)
dev.off()
```
## Compare Participants Files
#### Compare STEP 1: Open
Open files in a subfolder
```{r echo=FALSE, include=FALSE}
# One corpus document per file in the "Participants" subfolder.
srcParticipants <- tm::DirSource(directory = "Participants")
docsComp <- tm::Corpus(srcParticipants)
# tm::inspect(docsComp)
```
#### Compare STEP 2: Clean
```{r echo=FALSE, include=FALSE}
# Standard tm cleaning pipeline for the participant documents: lowercase,
# strip numbers and punctuation (incl. Unicode), collapse whitespace, then
# remove English stopwords and a few extra fillers.
docsComp <- tm::tm_map(docsComp, tm::content_transformer(tolower))
docsComp <- tm::tm_map(docsComp, tm::removeNumbers)
docsComp <- tm::tm_map(docsComp, tm::removePunctuation)
docsComp <- tm::tm_map(docsComp, tm::removePunctuation, ucp = TRUE) # Unicode punctuation too
docsComp <- tm::tm_map(docsComp, tm::stripWhitespace)
docsComp <- tm::tm_map(docsComp, tm::removeWords, tm::stopwords("english"))
docsComp <- tm::tm_map(docsComp, tm::removeWords, c("this", "that", "the", "other"))
# tm::inspect(docsComp)
```
#### Compare STEP 3: Matrix
```{r echo=FALSE, include=FALSE}
minFreqC <- 2    # words must occur MORE than this many times (strict >)
maxWordC <- 25   # words must be SHORTER than this many characters (strict <)
tdmComp <- tm::TermDocumentMatrix(docsComp)
mtxComp <- as.matrix(tdmComp)
wrdComp <- sort(rowSums(mtxComp), decreasing = TRUE)
dfC <- data.frame(word = names(wrdComp), freq = wrdComp)
# NOTE(review): strict > and < here, unlike the >= / <= filters used in the
# main wordcloud section -- confirm the difference is intentional.
dfC <- subset(dfC, freq > minFreqC)
dfC <- subset(dfC, nchar(word) < maxWordC)
head(dfC)
nrow(dfC)
dfComp <- dfC
```
### Compare STEP 4: Combine
```{r echo=FALSE}
wordcloud2::wordcloud2(data=dfC, size=0.5)
```
### Compare STEP 5: Compare
```{r echo=FALSE}
# Label each document column generically so this works for any number of
# participant files (previously hard-coded to exactly three names).
colnames(mtxComp) <- paste0("Participant_", seq_len(ncol(mtxComp)))
# Windows-only png arguments removed for portability.
png(filename = "comparecloud0.png", width = 6.5, height = 6.5, units = "in",
    pointsize = 12, bg = "white", res = 300)
wordcloud::comparison.cloud(mtxComp, max.words = 100, random.order = FALSE,
                            title.size = 1, title.bg.colors = c("lightgray"),
                            match.colors = TRUE, scale = c(3, 0.2))
dev.off()
# Repeat on the interactive device so the plot also shows in the report.
wordcloud::comparison.cloud(mtxComp, max.words = 100, random.order = FALSE,
                            title.size = 1, title.bg.colors = c("lightgray"),
                            match.colors = TRUE, scale = c(3, 0.2))
```
### Compare STEP 6: Commonality
```{r echo=FALSE}
wordcloud::commonality.cloud(mtxComp, max.words = 200, random.order = FALSE, colors=RColorBrewer::brewer.pal(8, "Dark2"))
```
## Compare Questions Files
#### Compare STEP 1: Open
Open files in a subfolder
```{r echo=FALSE, include=FALSE}
# One corpus document per file in the "Questions" subfolder (reuses the
# same docsComp variable as the participants section).
srcQuestions <- tm::DirSource(directory = "Questions")
docsComp <- tm::Corpus(srcQuestions)
# tm::inspect(docsComp)
```
#### Compare STEP 2: Clean
```{r echo=FALSE, include=FALSE}
# Standard tm cleaning pipeline for the question documents: lowercase,
# strip numbers and punctuation (incl. Unicode), collapse whitespace, then
# remove English stopwords and a few extra fillers.
docsComp <- tm::tm_map(docsComp, tm::content_transformer(tolower))
docsComp <- tm::tm_map(docsComp, tm::removeNumbers)
docsComp <- tm::tm_map(docsComp, tm::removePunctuation)
docsComp <- tm::tm_map(docsComp, tm::removePunctuation, ucp = TRUE) # Unicode punctuation too
docsComp <- tm::tm_map(docsComp, tm::stripWhitespace)
docsComp <- tm::tm_map(docsComp, tm::removeWords, tm::stopwords("english"))
docsComp <- tm::tm_map(docsComp, tm::removeWords, c("this", "that", "the", "other"))
# tm::inspect(docsComp)
```
#### Compare STEP 3: Matrix
```{r echo=FALSE, include=FALSE}
minFreqC <- 2    # words must occur MORE than this many times (strict >)
maxWordC <- 25   # words must be SHORTER than this many characters (strict <)
tdmComp <- tm::TermDocumentMatrix(docsComp)
mtxComp <- as.matrix(tdmComp)
wrdComp <- sort(rowSums(mtxComp), decreasing = TRUE)
dfC <- data.frame(word = names(wrdComp), freq = wrdComp)
# NOTE(review): strict > and < here, unlike the >= / <= filters used in the
# main wordcloud section -- confirm the difference is intentional.
dfC <- subset(dfC, freq > minFreqC)
dfC <- subset(dfC, nchar(word) < maxWordC)
head(dfC)
nrow(dfC)
dfComp <- dfC
```
### Compare STEP 4: Combine
```{r echo=FALSE}
wordcloud2::wordcloud2(data=dfC, size=0.5)
```
### Compare STEP 5: Compare
```{r echo=FALSE}
# Comparison cloud of the question documents (columns keep their filenames).
# Windows-only png arguments removed for portability.
png(filename = "comparecloudP.png", width = 6.5, height = 6.5, units = "in",
    pointsize = 12, bg = "white", res = 300)
wordcloud::comparison.cloud(mtxComp, max.words = 100, random.order = FALSE,
                            title.size = 1, title.bg.colors = c("lightgray"),
                            match.colors = TRUE, scale = c(3, 0.2))
dev.off()
# Repeat on the interactive device so the plot also shows in the report.
wordcloud::comparison.cloud(mtxComp, max.words = 100, random.order = FALSE,
                            title.size = 1, title.bg.colors = c("lightgray"),
                            match.colors = TRUE, scale = c(3, 0.2))
```
### Compare STEP 6: Commonality
```{r echo=FALSE}
wordcloud::commonality.cloud(mtxComp, max.words = 200, random.order = FALSE, colors=RColorBrewer::brewer.pal(8, "Dark2"))
```
# END