Tools

knitr::opts_chunk$set(echo = TRUE, include=TRUE, message=FALSE, warning=FALSE)
#from session 1 & 2
library(tidyverse)
library(udpipe)
library(flextable)
library(cowplot)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(syuzhet)  #analyse du sentimeent

# new for session 2
library(FactoMineR)
library(factoextra)
library(igraph)
library(ggwordcloud)
library(ggrepel)

library(Rtsne)
library(tidytext)

# new for session 3
library(cleanNLP)
library(text2vec)


theme_set(theme_minimal())

t1=Sys.time()

The basic LDA

proposed by Blei, Ng, and Jordan (2003)

a few theory

LDA

Application

Prepare the data (a TfIdf approach with cleanNLP elegant function)

UD<-readRDS("./Data/UD.rds")

#library(cleanNLP) #an other technics for tf_idf
tf <- UD%>%
  filter(upos %in% c("NOUN", "VERB")) %>%
  cnlp_utils_tfidf(min_df = 0.05, max_df = 0.95, tf_weight = "raw")

The LDA model with text2vec

#library(text2vec)
lda_model = LDA$new(n_topics = 12, doc_topic_prior = 0.1, topic_word_prior = 0.01)


set.seed(67) #pour la reproducibilité des résultats

#On définit les paramètres du processus d'estimation : 
##n_iter = le nombre d'itérations
##convergence_tol =le seuil de convergence
doc_topic_distr = 
  lda_model$fit_transform(x = tf, 
                          n_iter = 1000, 
                          convergence_tol = 0.001, 
                          n_check_convergence = 25, 
                          progressbar = TRUE)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |==                                                                    |   4%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |=======                                                               |  11%
  |                                                                            
  |========                                                              |  11%
  |                                                                            
  |========                                                              |  12%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |=========                                                             |  13%
  |                                                                            
  |=========                                                             |  14%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |==========                                                            |  15%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===========                                                           |  16%
  |                                                                            
  |============                                                          |  16%
  |                                                                            
  |============                                                          |  17%
  |                                                                            
  |============                                                          |  18%
  |                                                                            
  |=============                                                         |  18%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |==============                                                        |  19%
  |                                                                            
  |==============                                                        |  20%
  |                                                                            
  |==============                                                        |  21%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |===============                                                       |  22%
  |                                                                            
  |================                                                      |  22%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |================                                                      |  24%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=================                                                     |  25%
  |                                                                            
  |==================                                                    |  25%
  |                                                                            
  |==================                                                    |  26%
  |                                                                            
  |===================                                                   |  26%
  |                                                                            
  |===================                                                   |  27%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |====================                                                  |  28%
  |                                                                            
  |====================                                                  |  29%
  |                                                                            
  |=====================                                                 |  29%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=====================                                                 |  31%
  |                                                                            
  |======================                                                |  31%
  |                                                                            
  |======================                                                |  32%
  |                                                                            
  |=======================                                               |  32%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=======================                                               |  34%
  |                                                                            
  |========================                                              |  34%
  |                                                                            
  |========================                                              |  35%
  |                                                                            
  |=========================                                             |  35%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |==========================                                            |  36%
  |                                                                            
  |==========================                                            |  37%
  |                                                                            
  |==========================                                            |  38%
  |                                                                            
  |===========================                                           |  38%
  |                                                                            
  |===========================                                           |  39%
  |                                                                            
  |============================                                          |  39%
  |                                                                            
  |============================                                          |  40%
  |                                                                            
  |============================                                          |  41%
  |                                                                            
  |=============================                                         |  41%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |==============================                                        |  42%
  |                                                                            
  |==============================                                        |  43%
  |                                                                            
  |==============================                                        |  44%
  |                                                                            
  |===============================                                       |  44%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |================================                                      |  45%
  |                                                                            
  |================================                                      |  46%
  |                                                                            
  |=================================                                     |  46%
  |                                                                            
  |=================================                                     |  47%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |==================================                                    |  48%
  |                                                                            
  |==================================                                    |  49%
  |                                                                            
  |===================================                                   |  49%
  |                                                                            
  |===================================                                   |  50%
  |                                                                            
  |===================================                                   |  51%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |====================================                                  |  52%
  |                                                                            
  |=====================================                                 |  52%
  |                                                                            
  |=====================================                                 |  53%
  |                                                                            
  |=====================================                                 |  54%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |======================================                                |  55%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |=======================================                               |  56%
  |                                                                            
  |========================================                              |  56%
  |                                                                            
  |========================================                              |  57%
  |                                                                            
  |========================================                              |  58%
  |                                                                            
  |=========================================                             |  58%
  |                                                                            
  |=========================================                             |  59%
  |                                                                            
  |==========================================                            |  59%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |======================================================================| 100%INFO  [22:06:12.834] early stopping at 600 iteration
## 
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |                                                                      |   1%
  |                                                                            
  |=                                                                     |   1%
  |                                                                            
  |=                                                                     |   2%
  |                                                                            
  |==                                                                    |   2%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |==                                                                    |   4%
  |                                                                            
  |===                                                                   |   4%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |====                                                                  |   5%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |=====                                                                 |   6%
  |                                                                            
  |=====                                                                 |   7%
  |                                                                            
  |=====                                                                 |   8%
  |                                                                            
  |======                                                                |   8%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=======                                                               |   9%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |======================================================================| 100%INFO  [22:06:17.582] early stopping at 100 iteration
#description des topics en fonction d'un degré de pertinence = lamba ( lambda =1 probabilité d'obtenir le terme sachant le topic)

Topic description

lda_res<-as.data.frame(lda_model$get_top_words(n = 15, lambda = 0.30))
lda_res$rank<-as.numeric(row.names(lda_res))

lda_res<-lda_res%>% gather(variable, value, -rank)

ggplot(lda_res, aes(x=variable, y= rank, group =  value , label = value)) + 
  scale_y_reverse() + 
  geom_text(aes(color=variable,size=sqrt(26-rank)))+
  scale_color_hue()+
  guides(color="none",size="none")+
  labs(x="topics", y="par ordre de pertinence")

# un nuage de mot plutôt

A more interactive description (that we cannot edit here… need to be in rstudio)

library(LDAvis)
lda_model$plot() #mode interactif

the optimal number of topics

No optimal analytics methods. Okham rule.

but some systematic approachs by comparing index for different solutions.

package ldatuning

Some other models

Topics model are popular, and quickly new models were suggested.

STM : regression like

Introduce de control a third variable, as a regression approach model, very usefull to introduce a time parameter. But also let the topic being correlated, which open to hierarchical topics

semi-supervised lda

We could help the solution by predefining key tokens. Seed_lda

Notes

Beware to computing time! Best to sample for testing. (come back to the beginning)

t2=Sys.time()
t<- t2-t1
print(t)
## Time difference of 49.26987 secs

See you to later and go to session 4

Some exercises before, for training :

  • Explore the adjectives world
  • Compare administratiion (main) in term of qualificatives.

References

Blei, David M., Andrew Y. Ng, and Michael I. Jordan. 2003. “Latent Dirichlet Allocation.” J. Mach. Learn. Res. 3 (March): 993–1022. http://dl.acm.org/citation.cfm?id=944919.944937.