# Global knitr options: show code and output, silence package messages/warnings.
knitr::opts_chunk$set(echo = TRUE, include = TRUE, message = FALSE, warning = FALSE)

# From sessions 1 & 2
library(tidyverse)
library(udpipe)
library(flextable)
library(cowplot)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(syuzhet) # sentiment analysis

# New for session 2
library(FactoMineR)
library(factoextra)
library(igraph)
library(ggwordcloud)
library(ggrepel)
library(Rtsne)
library(tidytext)

# New for session 3
library(cleanNLP)
library(text2vec)

# Use a consistent minimal theme for every ggplot figure.
theme_set(theme_minimal())

# Start the session timer (total elapsed time is reported at the end).
t1 <- Sys.time()
LDA (Latent Dirichlet Allocation), proposed by Blei, Ng, and Jordan (2003)
Prepare the data (a TF-IDF approach with cleanNLP's elegant function)
# Load the UDPipe annotations saved in an earlier session.
UD <- readRDS("./Data/UD.rds")

# Keep only content-bearing tokens (nouns and verbs), then build the
# document-term matrix with cleanNLP's tf-idf helper.  Terms present in
# fewer than 5% or more than 95% of documents are dropped; raw counts
# are used as the term weight.
content_tokens <- UD %>%
  filter(upos %in% c("NOUN", "VERB"))
tf <- content_tokens %>%
  cnlp_utils_tfidf(min_df = 0.05, max_df = 0.95, tf_weight = "raw")
The LDA model with text2vec
#library(text2vec)
# Instantiate the text2vec LDA model: 12 topics with sparse symmetric
# priors (doc_topic_prior = alpha, topic_word_prior = beta).
lda_model <- LDA$new(n_topics = 12, doc_topic_prior = 0.1, topic_word_prior = 0.01)

set.seed(67) # for reproducibility of the results

# Estimation parameters:
##  n_iter              = maximum number of iterations
##  convergence_tol     = convergence threshold (early stopping)
##  n_check_convergence = check convergence every 25 iterations
doc_topic_distr <-
  lda_model$fit_transform(x = tf,
                          n_iter = 1000,
                          convergence_tol = 0.001,
                          n_check_convergence = 25,
                          progressbar = TRUE)
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================ | 24%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================= | 34%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|======================================================================| 100%INFO [22:06:12.834] early stopping at 600 iteration
##
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======================================================================| 100%INFO [22:06:17.582] early stopping at 100 iteration
# Describe the topics by a relevance degree lambda (lambda = 1 ranks terms by the probability of the term given the topic)
Topic description
# Top 15 terms per topic at relevance lambda = 0.30 (balances
# topic-specific probability against lift).
lda_res <- as.data.frame(lda_model$get_top_words(n = 15, lambda = 0.30))
# Row names are "1".."15": the within-topic relevance rank of each term.
lda_res$rank <- as.numeric(row.names(lda_res))

# Reshape to long format — one row per (rank, topic, term).
# pivot_longer() replaces the superseded gather().
lda_res <- lda_res %>%
  pivot_longer(-rank, names_to = "variable", values_to = "value")

# One column of terms per topic; more relevant terms are drawn larger
# (rank 1 at the top thanks to the reversed y axis).
ggplot(lda_res, aes(x = variable, y = rank, group = value, label = value)) +
  scale_y_reverse() +
  geom_text(aes(color = variable, size = sqrt(26 - rank))) +
  scale_color_hue() +
  guides(color = "none", size = "none") +
  labs(x = "topics", y = "par ordre de pertinence")
# or rather a word cloud
A more interactive description (which we cannot render here — it needs to be run in RStudio)
# Launch the LDAvis interactive topic browser; presumably opens in a
# browser/viewer pane — works in an interactive session, not in the
# rendered document.
library(LDAvis)
lda_model$plot() # interactive mode
No single analytic method is optimal (Occam's razor applies),
but there are systematic approaches that compare indices across different solutions
(e.g. the ldatuning package). Topic models are popular, and new variants were quickly suggested.
Structural topic models introduce control through a third variable, in a regression-like approach — very useful for introducing a time parameter. They also allow topics to be correlated, which opens the way to hierarchical topic models.
We can also guide the solution by predefining key tokens (seeded LDA).
Beware of computing time! It is best to work on a sample when testing (go back to the beginning).
# Report the total elapsed time for this session (t1 was set at the top).
# `elapsed` avoids masking base::t with a variable named `t`.
t2 <- Sys.time()
elapsed <- t2 - t1
print(elapsed)
## Time difference of 49.26987 secs
See you later — on to session 4.
Some exercises first, for practice: