Tools

knitr::opts_chunk$set(echo = TRUE, include=TRUE, message=FALSE, warning=FALSE)
# packages carried over from session 1
library(tidyverse)
library(udpipe)
library(flextable)
library(cowplot)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(syuzhet)  # sentiment analysis

# new for session 2
library(FactoMineR)
library(factoextra)
library(igraph)
library(ggwordcloud)
library(ggrepel)

library(Rtsne)
library(tidytext)
theme_set(theme_minimal())

# start a wall-clock timer; total knit time is printed at the end of the document
t1=Sys.time()

Dfm & Co-occurence matrix

We start from the annotation task. The first objective is to reconstitute each text from its lemmas. Because we are primarily interested in the content, we will work mainly with nouns, proper nouns and verbs — things, people and actions. We also add adjectives, as they are more often attributive than qualifying (fact versus judgement techniques). Van der Maaten, Laurens and Hinton (2008)

# Read the UDPipe annotation and rebuild one "text" per document by pasting
# together the lemmas of the content words only (nouns, proper nouns, verbs,
# adjectives).
UD<-readRDS("./Data/UD.rds")
df<-UD %>%
  select(doc_id, lemma, upos)%>%
  # %in% against a set is equivalent to, and clearer than, chained == | == tests
  filter(upos %in% c("NOUN", "PROPN", "VERB", "ADJ"))%>%
  group_by(doc_id)%>%
  summarise(text=paste(lemma, collapse=" "))

head(df,5)
## # A tibble: 5 x 2
##   doc_id   text                                                                 
##   <chr>    <chr>                                                                
## 1 doc1     accoucher février fille inscrire CAF faire démarche temps heure cont~
## 2 doc10    contrôle situation rendre compte erreur faire déclaration impôt dema~
## 3 doc100   partir retraite anticipé falloir papier avoir droit dur fois contact~
## 4 doc1000  adresser réclamation recommander sujet remboursement soin réaliser é~
## 5 doc10000 délai paiement limite infraction faire amende éditer date courrier r~
#add doc_var  variables 

Come back to quanteda, making a dfm and a cfm.

# Build a quanteda corpus from the reconstituted lemma texts.
# NOTE(review): doc_id is not attached as a docvar (see the commented hint),
# so quanteda renames the documents text1, text2, ... — confirm this is intended.
corpus <- corpus(df$text) # docvars(corpus, field = "doc_id")
summary(corpus)
## Corpus consisting of 16576 documents, showing 100 documents:
## 
##     Text Types Tokens Sentences
##    text1    34     36         1
##    text2    34     38         1
##    text3    41     44         1
##    text4    12     12         1
##    text5    21     28         1
##    text6    52     75         1
##    text7    17     17         1
##    text8    51     63         1
##    text9    57     70         1
##   text10    34     36         1
##   text11    43     50         1
##   text12    54     67         1
##   text13    12     13         1
##   text14    39     48         1
##   text15    29     33         1
##   text16    13     13         1
##   text17    13     13         1
##   text18    11     11         1
##   text19    14     14         1
##   text20    16     16         1
##   text21    49     57         2
##   text22    22     24         1
##   text23    47     59         1
##   text24    58     70         1
##   text25    55     72         1
##   text26    77    108         1
##   text27    36     41         1
##   text28    29     30         1
##   text29    51     67         1
##   text30    21     21         1
##   text31    42     49         1
##   text32    25     27         1
##   text33    46     60         1
##   text34    35     39         1
##   text35    10     11         1
##   text36    45     60         1
##   text37    24     31         1
##   text38    12     16         1
##   text39    22     24         1
##   text40    45     62         3
##   text41    35     43         1
##   text42    35     40         1
##   text43    28     32         1
##   text44    37     50         1
##   text45    50     57         1
##   text46    33     40         1
##   text47    14     14         1
##   text48     6      6         1
##   text49    32     34         1
##   text50     9      9         1
##   text51    56     69         1
##   text52    55     58         1
##   text53    21     26         1
##   text54    20     23         1
##   text55    45     47         1
##   text56    12     13         1
##   text57    47     63         1
##   text58    37     42         1
##   text59    21     23         1
##   text60    41     53         1
##   text61    15     20         1
##   text62    30     37         1
##   text63    38     55         1
##   text64    18     21         1
##   text65    35     47         1
##   text66    39     47         1
##   text67    38     42         1
##   text68    38     43         1
##   text69    12     12         1
##   text70    36     41         1
##   text71    21     27         1
##   text72    16     16         1
##   text73    17     18         1
##   text74    67     81         1
##   text75    13     14         1
##   text76    17     22         1
##   text77    16     16         1
##   text78    27     27         1
##   text79    56     79         1
##   text80    30     33         1
##   text81    14     14         1
##   text82    32     39         1
##   text83    42     42         1
##   text84    30     34         1
##   text85    28     37         1
##   text86    15     15         1
##   text87    59     74         1
##   text88    25     30         1
##   text89    14     14         1
##   text90    16     16         1
##   text91    12     12         1
##   text92    25     29         1
##   text93    29     34         1
##   text94    33     36         1
##   text95    48     76         1
##   text96    35     44         1
##   text97    25     26         1
##   text98    40     55         1
##   text99    36     47         1
##  text100    25     25         1
# Tokenise, drop French stopwords, and build the document-feature matrix.
# NOTE(review): the object is named `dfm`, which shadows quanteda::dfm();
# kept as-is because every later chunk refers to it by this name.
dfm<- corpus %>%
  tokens() %>% 
  tokens_remove(stopwords("french"))%>%
  dfm()

# keep only terms occurring at least 50 times in the whole corpus
dfm<- dfm %>%
    dfm_trim(min_termfreq = 50, verbose = FALSE)  
# rule of thumb: the frequency threshold should be at least about twice a
# density of 2 per thousand tokens

# wordcloud (seeded so the layout is reproducible)
set.seed(100)
textplot_wordcloud(dfm)

dfm_df<- as.data.frame(dfm) 

dim(dfm_df)
## [1] 16576  1659
head(dfm,8)
## Document-feature matrix of: 8 documents, 1,658 features (98.20% sparse) and 0 docvars.
##        features
## docs    février fille inscrire caf faire démarche temps heure contacter
##   text1       1     1        1   2     1        1     1     1         1
##   text2       0     0        0   1     1        0     0     0         0
##   text3       0     0        0   0     0        0     1     0         1
##   text4       0     0        0   0     0        0     0     0         0
##   text5       0     0        0   0     1        0     0     0         0
##   text6       0     0        0   1     2        0     0     0         0
##        features
## docs    téléphone
##   text1         2
##   text2         1
##   text3         1
##   text4         0
##   text5         0
##   text6         0
## [ reached max_ndoc ... 2 more documents, reached max_nfeat ... 1,648 more features ]
# counting words: total corpus frequency of each retained term,
# computed by melting the dfm to long form and summing per word
dfm_count <- dfm_df %>%
  pivot_longer(cols = -doc_id, names_to = "word", values_to = "n") %>%
  count(word, wt = n, name = "Frequency")

Map the tokens

The issue is that in a wordcloud, position is meaningless — just random placement and paving.

Goal: represent similarity between words, based on their co-occurrences, in a low-dimensional space.

PCA approach

Come back to PCA! (we skip CA and MCA)

# with library(FactoMineR)
# PCA on the word columns; documents are rows, words are variables.
foo<- dfm_df %>%
  column_to_rownames(var="doc_id")

# PCA — pass the whole matrix instead of hard-coding the column count (the
# original foo[, c(1:1658)] broke whenever the trimming threshold changed)
res.pca <- PCA(foo, ncp=10, graph=FALSE) # could be pretty long
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 10))

# label selection threshold: keep only words loading strongly on an axis
a <- .2

foo1<-as.data.frame(res.pca$var$coord) %>% 
  rownames_to_column(var="label") %>%
  filter(Dim.1>a | Dim.1< -a | Dim.2>a | Dim.2< - a)

ggplot(foo1, aes(x=Dim.1, y=Dim.2))+
  geom_text(aes(label=label),size=2)

foo1<-as.data.frame(res.pca$var$coord) %>% 
  rownames_to_column(var="label") %>%
  filter(Dim.3> a | Dim.3< -a | Dim.4> a | Dim.4< -a)

# BUG FIX: the original re-plotted Dim.1/Dim.2 here, but the filter above
# selects words salient on axes 3 and 4 — those are the axes to display.
ggplot(foo1, aes(x=Dim.3, y=Dim.4))+
  geom_text(aes(label=label),size=2)

# typology with k-means: cluster words by their coordinates on the 10 PCA axes

foo<-as.data.frame(res.pca$var$coord)
# 20 clusters, 25 random restarts to stabilise the partition
resKM <- kmeans(foo, 20, nstart = 25, trace=0)


# attach each word's cluster and its corpus frequency (for sizing)
foo<-as.data.frame(resKM$cluster) %>%
  rename(cluster=1)%>% 
  rownames_to_column(var="word") %>%
  left_join(dfm_count)
foo$cluster<-as.factor(foo$cluster)


set.seed(42) #for reproducibility

# one wordcloud panel per cluster (library(ggwordcloud))
ggplot(foo, aes(label = word, size=Frequency, group = cluster)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 5) +
  facet_wrap(vars(cluster), ncol=4)+
  theme_minimal()+ 
  labs(title=NULL) # how to remove the facet numbering?

ggsave("./Images/cluster.jpeg", width = 28, height = 20, units = "cm")

Tsne Approach

A legacy of MDS (Multidimensional Scaling), due to Van der Maaten. The source is:

t-SNE projects a large number of objects into a low-dimensional space, as traditional MDS does, but with a density principle that dilates space where points are dense and contracts it where they are not.

https://cran.r-project.org/web/packages/Rtsne/Rtsne.pdf

The best alternative seems to be UMAP.

# t-SNE embeds rows, so transpose the dfm: one row per word, one column per document
foo <- t(select(dfm_df, -doc_id))

# keep the word labels aside for later joins
word <- data.frame(word = rownames(foo))

Model and 2D tsne representation

set.seed(42) # Sets seed for reproducibility
# Barnes-Hut t-SNE on the word x document matrix:
#   initial_dims: PCA pre-reduction to 50 components (partial_pca speeds this up)
#   perplexity:   effective neighbourhood size
#   theta:        speed/accuracy trade-off (0 = exact)
tsne_out <- Rtsne(foo,
                  initial_dims = 50,
                  perplexity = 50,
                  partial_pca=TRUE,
                  theta=.5,
                  num_threads=4, 
                  verbose=1)
## Performing PCA
## Read the 1658 x 50 data matrix successfully!
## OpenMP is working. 4 threads.
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.30 seconds (sparsity = 0.164029)!
## Learning embedding...
## Iteration 50: error is 65.491716 (50 iterations in 0.19 seconds)
## Iteration 100: error is 65.491716 (50 iterations in 0.14 seconds)
## Iteration 150: error is 65.491716 (50 iterations in 0.15 seconds)
## Iteration 200: error is 65.491716 (50 iterations in 0.17 seconds)
## Iteration 250: error is 65.491716 (50 iterations in 0.20 seconds)
## Iteration 300: error is 2.040402 (50 iterations in 0.17 seconds)
## Iteration 350: error is 1.699851 (50 iterations in 0.12 seconds)
## Iteration 400: error is 1.650918 (50 iterations in 0.13 seconds)
## Iteration 450: error is 1.627877 (50 iterations in 0.11 seconds)
## Iteration 500: error is 1.616187 (50 iterations in 0.14 seconds)
## Iteration 550: error is 1.609548 (50 iterations in 0.11 seconds)
## Iteration 600: error is 1.606150 (50 iterations in 0.12 seconds)
## Iteration 650: error is 1.602474 (50 iterations in 0.11 seconds)
## Iteration 700: error is 1.600997 (50 iterations in 0.12 seconds)
## Iteration 750: error is 1.599377 (50 iterations in 0.11 seconds)
## Iteration 800: error is 1.597540 (50 iterations in 0.11 seconds)
## Iteration 850: error is 1.596278 (50 iterations in 0.11 seconds)
## Iteration 900: error is 1.595396 (50 iterations in 0.11 seconds)
## Iteration 950: error is 1.594218 (50 iterations in 0.11 seconds)
## Iteration 1000: error is 1.593394 (50 iterations in 0.11 seconds)
## Fitting performed in 2.67 seconds.
# Assemble the 2-D embedding, attach corpus frequencies, and keep
# mid-frequency words only (extremes clutter the map).
tsne_out1<-as.data.frame(tsne_out$Y)
tsne_out2<-cbind(word,tsne_out1)%>%
  left_join(dfm_count)%>%
  filter(Frequency<7000 & Frequency>150)

ggplot(tsne_out2, aes(x=V1, y=V2))+
  geom_text_repel(aes(label=word, 
                      size=log10(Frequency),
                      alpha=log10(Frequency)),
                  color="black",
                  # BUG FIX: the ggrepel argument is `max.overlaps`; the
                  # original `max.overlap` was silently ignored, so labels
                  # were dropped at the default limit of 10
                  max.overlaps=Inf)+
  theme(legend.position = "none")+
  labs(x=NULL, y=NULL)+  
  scale_size(range = c(.1, 3.5))

ggsave("./Images/tsne.jpeg", width = 28, height = 20, units = "cm")

Semantic Networks

A quick and dirty quanteda function textplot_network

It is based on igraph. that will be the end of the lecture.

## co-occurrence computing: feature co-occurrence matrix from the trimmed dfm
tag_fcm <- fcm(dfm) 
head(tag_fcm)
## Feature co-occurrence matrix of: 6 by 1,658 features.
##           features
## features   février fille inscrire  caf faire démarche temps heure contacter
##   février       92    27       27  180   544      111   108    45       109
##   fille          0   339       52  128   618      180   172    95        81
##   inscrire       0     0      100   52   449      128    91    56        65
##   caf            0     0        0 1652  1669      342   396   166       406
##   faire          0     0        0    0  5666     3362  2158  1502      1750
##   démarche       0     0        0    0     0      897   627   407       498
##           features
## features   téléphone
##   février        128
##   fille          106
##   inscrire        95
##   caf            490
##   faire         2447
##   démarche       665
## [ reached max_nfeat ... 1,648 more features ]
# select the 100 most frequent features as network nodes
toptag <- names(topfeatures(dfm, 100)) 

topgat_fcm <- fcm_select(tag_fcm, pattern = toptag) # keep co-occurrences among the top 100 words only

# semantic network: edge darkness/thickness reflect co-occurrence strength
textplot_network(topgat_fcm, 
                 min_freq = 0.5, 
                 edge_alpha = 0.01, 
                 edge_size = 0.1,
                vertex_labelsize = 2.5)

ggsave("./Images/network.jpeg", width = 28, height = 20, units = "cm")

Tfidf matrix

Raw frequencies could be misleading, as frequent words are not always distinctive. The idea is to weight term frequency by document frequency, which is the number of documents in which the term can be observed.

\[TfIdf_{i}=Tf_{i}\ln(\frac{N}{n_{i}})\] where N is the Total number of document and \(n_{i}\) the number of document in which term i is present.

To change style of writing operation, we will use tidytext. The code come from

# Per-document counts of content-word lemmas, sorted by raw count.
foo<-UD %>%
  filter(upos %in% c("NOUN", "PROPN", "VERB", "ADJ"))%>%
  count(doc_id, lemma, sort=TRUE)

# total number of content tokens per document
total_words <- foo %>% 
  group_by(doc_id) %>% 
  summarize(total = sum(n))

# Rank terms within each document and compute their relative frequency.
# FIX: the original used a multi-row summarise(), deprecated since dplyr 1.0;
# mutate() + select() is the supported equivalent and yields the same table.
Avis_words <- left_join(foo, total_words) %>%
  mutate(Term_frequency=n/total)%>%
  group_by(doc_id)%>%
  mutate(rank = row_number())%>%
  select(doc_id, word=lemma, rank, n, Term_frequency)
Avis_words
## # A tibble: 567,622 x 5
## # Groups:   doc_id [16,576]
##    doc_id word        rank     n Term_frequency
##    <chr>  <chr>      <int> <int>          <dbl>
##  1 doc1   avoir          1     2         0.0556
##  2 doc1   téléphone      2     2         0.0556
##  3 doc1   10mn           3     1         0.0278
##  4 doc1   1500euro       4     1         0.0278
##  5 doc1   accoucher      5     1         0.0278
##  6 doc1   aide           6     1         0.0278
##  7 doc1   allocation     7     1         0.0278
##  8 doc1   APL            8     1         0.0278
##  9 doc1   attente        9     1         0.0278
## 10 doc1   bête          10     1         0.0278
## # ... with 567,612 more rows

Just test on two documents to get a flavour of the story.

Distribution and Zipf-like plot.

# distribution of term frequencies for two sample documents
Avis_words%>%
  filter(doc_id=="doc12441"|doc_id=="doc12912") %>% 
  ggplot(aes(x=Term_frequency, fill = doc_id)) +
  geom_histogram(show.legend = FALSE) +
#  xlim(NA, 0.0009) +
  facet_wrap(~doc_id, ncol = 2, scales = "free_y")

# Zipf-like plot: rank vs frequency on log-log axes
Avis_words%>%
  filter(doc_id=="doc12441"|doc_id=="doc12912") %>% 
  ggplot(aes(x=rank,Term_frequency, color = doc_id)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

tfidf computing with tidytext function bind_tf_idf

# tf-idf per (document, word) via tidytext
Avis_tf_idf <- Avis_words %>%
  bind_tf_idf(word, doc_id, n)

# Show the most document-specific terms.
# FIX: the original select(-rank, n, Term_frequency) mixed negative and
# positive selection, which recent dplyr versions reject; dropping rank
# alone yields the same set of columns.
Avis_tf_idf %>%
   select(-rank) %>% 
  filter(tf_idf<0.5)%>%
  arrange(desc(tf_idf))
## # A tibble: 560,204 x 7
## # Groups:   doc_id [16,528]
##    doc_id   word           n Term_frequency     tf   idf tf_idf
##    <chr>    <chr>      <int>          <dbl>  <dbl> <dbl>  <dbl>
##  1 doc12268 devenir        1         0.125  0.125   4.00  0.500
##  2 doc8600  pension        1         0.125  0.125   4.00  0.500
##  3 doc13167 Iran           3         0.06   0.06    8.33  0.500
##  4 doc10978 vingtaine      1         0.0769 0.0769  6.50  0.500
##  5 doc12466 précarité      1         0.0769 0.0769  6.50  0.500
##  6 doc14037 essaye         1         0.0769 0.0769  6.50  0.500
##  7 doc14048 covidre        1         0.0769 0.0769  6.50  0.500
##  8 doc2286  armer          1         0.0769 0.0769  6.50  0.500
##  9 doc555   réinscrire     2         0.0769 0.0769  6.50  0.500
## 10 doc6155  Séville        3         0.0769 0.0769  6.50  0.500
## # ... with 560,194 more rows
# top-15 tf-idf terms for the two sample documents
Avis_tf_idf%>%
  filter(doc_id=="doc12441"|doc_id=="doc12912") %>% 
  group_by(doc_id) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = doc_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~doc_id, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

# overall distribution of tf-idf scores across the corpus
Avis_tf_idf%>%
  ggplot(aes(tf_idf))+
  geom_histogram(binwidth = 0.002)+
  xlim(0,1)

Reuse of tsne procedure. But first compute distance

# Word x document tf-idf matrix: keep reasonably frequent words, spread to
# wide form, and replace structural missings (word absent from document) by 0.
foo<-Avis_tf_idf %>% 
  left_join(dfm_count) %>%
  filter(Frequency>50)%>%
  select(doc_id, word, tf_idf) %>% 
  # FIX: name id_cols explicitly — positional use is deprecated in tidyr
  pivot_wider(id_cols = doc_id, names_from = "word", values_from = "tf_idf")%>%
 column_to_rownames(var="doc_id") %>%
  mutate(
    across(everything(), ~replace_na(.x, 0))
  )

# transpose so rows are words, and keep the word labels aside
foo<-t(foo)
word<-as.data.frame(rownames(foo))%>%
  rename(word=1)

then apply the model

set.seed(42) # Sets seed for reproducibility
# Same t-SNE settings as before, now on the tf-idf matrix.
# check_duplicates = FALSE: identical tf-idf rows can legitimately occur here
tsne_out <- Rtsne(foo,
                  initial_dims = 50,
                  perplexity = 50,
                  partial_pca=TRUE,
                  theta=.5,
                  num_threads=4, 
                  verbose=1,
                  check_duplicates = FALSE)
## Performing PCA
## Read the 1609 x 50 data matrix successfully!
## OpenMP is working. 4 threads.
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.25 seconds (sparsity = 0.172802)!
## Learning embedding...
## Iteration 50: error is 64.695991 (50 iterations in 0.20 seconds)
## Iteration 100: error is 64.696010 (50 iterations in 0.19 seconds)
## Iteration 150: error is 64.696014 (50 iterations in 0.21 seconds)
## Iteration 200: error is 64.696001 (50 iterations in 0.24 seconds)
## Iteration 250: error is 64.696010 (50 iterations in 0.29 seconds)
## Iteration 300: error is 2.906426 (50 iterations in 0.23 seconds)
## Iteration 350: error is 1.785382 (50 iterations in 0.13 seconds)
## Iteration 400: error is 1.724227 (50 iterations in 0.12 seconds)
## Iteration 450: error is 1.691735 (50 iterations in 0.11 seconds)
## Iteration 500: error is 1.677824 (50 iterations in 0.12 seconds)
## Iteration 550: error is 1.666957 (50 iterations in 0.12 seconds)
## Iteration 600: error is 1.658279 (50 iterations in 0.12 seconds)
## Iteration 650: error is 1.652063 (50 iterations in 0.12 seconds)
## Iteration 700: error is 1.648806 (50 iterations in 0.12 seconds)
## Iteration 750: error is 1.645790 (50 iterations in 0.12 seconds)
## Iteration 800: error is 1.643270 (50 iterations in 0.12 seconds)
## Iteration 850: error is 1.642125 (50 iterations in 0.11 seconds)
## Iteration 900: error is 1.640837 (50 iterations in 0.11 seconds)
## Iteration 950: error is 1.639355 (50 iterations in 0.11 seconds)
## Iteration 1000: error is 1.638975 (50 iterations in 0.11 seconds)
## Fitting performed in 3.00 seconds.
# Assemble the tf-idf embedding; after cbind with the matrix tsne_out$Y,
# the coordinate columns are named "1" and "2".
tsne_out1<-tsne_out$Y
tsne_out2<-as.data.frame(cbind(word,tsne_out1) )%>%
  left_join(dfm_count)%>%
  filter(Frequency<7000 & Frequency>100)

ggplot(tsne_out2, aes(x=`1`, y=`2` ))+
  geom_text_repel(aes(label=word, 
                      size=log10(Frequency),
                      alpha=log10(Frequency)),
                  color="black",
                  # BUG FIX: the ggrepel argument is `max.overlaps`; the
                  # original `max.overlap` was silently ignored
                  max.overlaps=Inf)+
  theme(legend.position = "none")+
  labs(x=NULL, y=NULL)+  
  scale_size(range = c(.1, 3))

ggsave("./Images/tsne2.jpeg", width = 28, height = 20, units = "cm")

Compare groups

Playing with keyness, a statistic to find discriminant words across groups of texts. https://quanteda.io/reference/textstat_keyness.html

( from NPS game)

# 1 corpus definition
df_work<-readRDS("./Data/df_work.rds")

corpus<-corpus(df_work,text_field ="description")

# 2 tokenisation (per group): drop punctuation, French stopwords, and
# "très"-prefixed intensifiers, then pool documents by the ressenti docvar

toks <- tokens(corpus, remove_punct = TRUE) %>% 
    tokens_remove(pattern = stopwords("fr"))%>%
   tokens_remove(pattern="très*") %>%
    tokens_group(groups = ressenti)

# 3 dfm building 

dfm <- dfm(toks) %>%   
  dfm_trim(min_termfreq = 40, verbose = FALSE)

# 4 display the comparison wordcloud (one colour per group)
# NOTE(review): `col` is not defined anywhere in this file — presumably a
# colour vector created in an earlier session; verify before knitting.

textplot_wordcloud(dfm,comparison = TRUE, color = col)

Keyness for positive and negative.

A clear answer !

# Create a dfm per group
# NOTE(review): toks was already grouped by ressenti above, so this second
# tokens_group() is redundant (but harmless); also note this rebuilds dfm
# without the min_termfreq trim used earlier.
dfm <-toks %>%
    tokens_group(groups = ressenti) %>% 
  dfm()

# Calculate keyness with "Positif" as target group against all other categories
result_keyness <- textstat_keyness(dfm, target = "Positif") %>% 
  filter (n_target>20)

# Plot estimated word keyness for the positive group
g1<-textplot_keyness(result_keyness,   n = 20L, labelsize = 3,   show_legend = FALSE, 
                     show_reference = TRUE,   color = c("Darkgreen", "gray"))+
  labs(x=NULL)

# Calculate keyness with "Négatif" as target group against all other categories
result_keyness <- textstat_keyness(dfm, target = "Négatif") %>% 
  filter (n_target>20)

# Plot estimated word keyness for the negative group
g2<-textplot_keyness(result_keyness,   n = 20L, labelsize = 3,   show_legend = FALSE, 
                     show_reference = TRUE,   color = c("firebrick", "gray"))+
  labs(x=NULL)

# side-by-side comparison of the two keyness plots
plot_grid(
  g1, g2,
  labels = "AUTO"
  )

ggsave("./Images/Keyness.jpeg", width = 28, height = 20, units = "cm")

More about networks

a complete graph with igraph

Notes

Beware of computing time! It is best to sample for testing. (come back to the beginning)

# stop the timer started at the top of the document and report elapsed time
t2=Sys.time()
t<- t2-t1
print(t)
## Time difference of 10.04649 mins

See you later, and on to session 3.

Some exercises before, for training :

  • Explore the adjectives world
  • Compare the main administrations in terms of qualifiers.

References

Van der Maaten, Laurens, and Geoffrey Hinton. 2008. “Visualizing Data Using t-SNE.” Journal of Machine Learning, 2579–2605.