# Chunk options: show code and output, silence package startup messages/warnings
knitr::opts_chunk$set(echo = TRUE, include=TRUE, message=FALSE, warning=FALSE)
# From session 1
library(tidyverse)
library(udpipe)
library(flextable)
library(cowplot)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(syuzhet) # sentiment analysis
# New for session 2
library(FactoMineR)
library(factoextra)
library(igraph)
library(ggwordcloud)
library(ggrepel)
library(Rtsne)
library(tidytext)
theme_set(theme_minimal())
# Start the session timer; total runtime is printed at the end of the file
t1 <- Sys.time()
We start from the annotation task. The first objective is to reconstitute each text from its lemmas. Because we are first interested in the content, we will work mainly with nouns, proper names and verbs — things, people and actions. We will also add adjectives, as they are more often attributive than qualifying (fact versus judgement). Van der Maaten, Laurens and Hinton (2008)
# Read the UDPipe annotation and rebuild one lemma-based text per document.
# Only content words are kept: nouns, proper names, verbs and adjectives.
UD<-readRDS("./Data/UD.rds")
df<-UD %>%
  select(doc_id, lemma, upos)%>%
  # %in% is clearer and less error-prone than chained == tests joined with |
  filter(upos %in% c("NOUN", "PROPN", "VERB", "ADJ"))%>%
  group_by(doc_id)%>%
  # Collapse the kept lemmas of each document back into a single string
  summarise(text=paste(lemma, collapse=" "))
head(df,5)
## # A tibble: 5 x 2
## doc_id text
## <chr> <chr>
## 1 doc1 accoucher février fille inscrire CAF faire démarche temps heure cont~
## 2 doc10 contrôle situation rendre compte erreur faire déclaration impôt dema~
## 3 doc100 partir retraite anticipé falloir papier avoir droit dur fois contact~
## 4 doc1000 adresser réclamation recommander sujet remboursement soin réaliser é~
## 5 doc10000 délai paiement limite infraction faire amende éditer date courrier r~
#TODO: attach document-level variables (docvars) to the corpus below
Coming back to quanteda: we build a dfm and an fcm.
# Build a quanteda corpus from the reconstituted lemma texts.
# NOTE(review): doc_id is not attached as a docvar (the call is commented out),
# so documents are renamed text1..textN — confirm this is intended.
corpus <- corpus(df$text) # docvars(corpus, field = "doc_id")
summary(corpus)
## Corpus consisting of 16576 documents, showing 100 documents:
##
## Text Types Tokens Sentences
## text1 34 36 1
## text2 34 38 1
## text3 41 44 1
## text4 12 12 1
## text5 21 28 1
## text6 52 75 1
## text7 17 17 1
## text8 51 63 1
## text9 57 70 1
## text10 34 36 1
## text11 43 50 1
## text12 54 67 1
## text13 12 13 1
## text14 39 48 1
## text15 29 33 1
## text16 13 13 1
## text17 13 13 1
## text18 11 11 1
## text19 14 14 1
## text20 16 16 1
## text21 49 57 2
## text22 22 24 1
## text23 47 59 1
## text24 58 70 1
## text25 55 72 1
## text26 77 108 1
## text27 36 41 1
## text28 29 30 1
## text29 51 67 1
## text30 21 21 1
## text31 42 49 1
## text32 25 27 1
## text33 46 60 1
## text34 35 39 1
## text35 10 11 1
## text36 45 60 1
## text37 24 31 1
## text38 12 16 1
## text39 22 24 1
## text40 45 62 3
## text41 35 43 1
## text42 35 40 1
## text43 28 32 1
## text44 37 50 1
## text45 50 57 1
## text46 33 40 1
## text47 14 14 1
## text48 6 6 1
## text49 32 34 1
## text50 9 9 1
## text51 56 69 1
## text52 55 58 1
## text53 21 26 1
## text54 20 23 1
## text55 45 47 1
## text56 12 13 1
## text57 47 63 1
## text58 37 42 1
## text59 21 23 1
## text60 41 53 1
## text61 15 20 1
## text62 30 37 1
## text63 38 55 1
## text64 18 21 1
## text65 35 47 1
## text66 39 47 1
## text67 38 42 1
## text68 38 43 1
## text69 12 12 1
## text70 36 41 1
## text71 21 27 1
## text72 16 16 1
## text73 17 18 1
## text74 67 81 1
## text75 13 14 1
## text76 17 22 1
## text77 16 16 1
## text78 27 27 1
## text79 56 79 1
## text80 30 33 1
## text81 14 14 1
## text82 32 39 1
## text83 42 42 1
## text84 30 34 1
## text85 28 37 1
## text86 15 15 1
## text87 59 74 1
## text88 25 30 1
## text89 14 14 1
## text90 16 16 1
## text91 12 12 1
## text92 25 29 1
## text93 29 34 1
## text94 33 36 1
## text95 48 76 1
## text96 35 44 1
## text97 25 26 1
## text98 40 55 1
## text99 36 47 1
## text100 25 25 1
# Tokenise, drop French stopwords, build the document-feature matrix and
# keep only terms that appear at least 50 times overall, in one pipeline.
dfm <- corpus %>%
  tokens() %>%
  tokens_remove(stopwords("french")) %>%
  dfm() %>%
  dfm_trim(min_termfreq = 50, verbose = FALSE)
# Rule of thumb: the frequency threshold corresponds to a density of roughly
# 2 per thousand tokens.
# Word cloud of the trimmed vocabulary
set.seed(100)
textplot_wordcloud(dfm)
# Data-frame view of the dfm: a doc_id column plus one column per feature
dfm_df <- as.data.frame(dfm)
dim(dfm_df)
## [1] 16576 1659
head(dfm,8)
## Document-feature matrix of: 8 documents, 1,658 features (98.20% sparse) and 0 docvars.
## features
## docs février fille inscrire caf faire démarche temps heure contacter
## text1 1 1 1 2 1 1 1 1 1
## text2 0 0 0 1 1 0 0 0 0
## text3 0 0 0 0 0 0 1 0 1
## text4 0 0 0 0 0 0 0 0 0
## text5 0 0 0 0 1 0 0 0 0
## text6 0 0 0 1 2 0 0 0 0
## features
## docs téléphone
## text1 2
## text2 1
## text3 1
## text4 0
## text5 0
## text6 0
## [ reached max_ndoc ... 2 more documents, reached max_nfeat ... 1,648 more features ]
# Total corpus frequency of every feature: reshape to long format
# (one row per document/word pair) then sum the counts per word.
dfm_count <- dfm_df %>%
  pivot_longer(-doc_id, names_to = "word", values_to = "n") %>%
  count(word, wt = n, name = "Frequency")
The issue with a word cloud is that position is meaningless — it is just random paving.
Goal: represent similarity between words, based on their co-occurrences, in a low-dimensional space.
Coming back to PCA! (we skip CA and MCA)
# PCA on the document-feature matrix (FactoMineR)
foo<- dfm_df %>%
  column_to_rownames(var="doc_id")
# PCA on all feature columns. The original hard-coded foo[, 1:1658], which is
# every column of foo but breaks silently if the vocabulary size changes;
# passing foo directly is equivalent and robust. Keep 10 components.
res.pca <- PCA(foo, ncp=10, graph=FALSE) #could be pretty long
fviz_eig(res.pca, addlabels = TRUE, ylim = c(0, 10))
# Label-selection threshold: only plot words with |coordinate| > a on the axes
a <- .2
foo1<-as.data.frame(res.pca$var$coord) %>%
  rownames_to_column(var="label") %>%
  # abs() replaces the error-prone "Dim.1 < - a" spelling (one space away
  # from an assignment); behaviour is identical
  filter(abs(Dim.1) > a | abs(Dim.2) > a)
ggplot(foo1, aes(x=Dim.1, y=Dim.2))+
  geom_text(aes(label=label),size=2)
foo1<-as.data.frame(res.pca$var$coord) %>%
  rownames_to_column(var="label") %>%
  filter(abs(Dim.3) > a | abs(Dim.4) > a)
# Fix: the original plotted Dim.1/Dim.2 here although the labels were selected
# on Dim.3/Dim.4 — plot the axes the filter actually refers to.
ggplot(foo1, aes(x=Dim.3, y=Dim.4))+
  geom_text(aes(label=label),size=2)
# Typology with k-means on the word coordinates in the 10-D PCA space
pca_coords <- as.data.frame(res.pca$var$coord)
resKM <- kmeans(pca_coords, 20, nstart = 25, trace = 0)
# One row per word: its cluster assignment plus its corpus frequency
foo <- as.data.frame(resKM$cluster) %>%
  rename(cluster = 1) %>%
  rownames_to_column(var = "word") %>%
  left_join(dfm_count) %>%
  mutate(cluster = as.factor(cluster))
set.seed(42) # for reproducibility
# library(ggwordcloud)
# One word-cloud panel per cluster, word size proportional to frequency
ggplot(foo, aes(label = word, size = Frequency, group = cluster)) +
  geom_text_wordcloud() +
  scale_size_area(max_size = 5) +
  facet_wrap(vars(cluster), ncol = 4) +
  theme_minimal() +
  labs(title = NULL) # how can the facet numbering be removed?
ggsave("./Images/cluster.jpeg", width = 28, height = 20, units = "cm")
A descendant of MDS (multidimensional scaling), due to Van der Maaten. The source is:
t-SNE projects a large number of objects into a low-dimensional space, as traditional MDS does, but with a density principle that dilates the space where points are dense and contracts it where they are not.
https://cran.r-project.org/web/packages/Rtsne/Rtsne.pdf
The best alternative seems to be UMAP.
# Transpose the matrix so rows are words and columns are documents
# (t-SNE will embed the words, not the documents)
foo <- dfm_df %>%
  select(-doc_id) %>%
  t()
# Keep the word labels aside for joining back after the embedding
word <- as.data.frame(rownames(foo))
names(word)[1] <- "word"
Model and 2D tsne representation
set.seed(42) # Sets seed for reproducibility
# Barnes-Hut t-SNE on the word x document matrix:
# - initial_dims = 50 with partial_pca: reduce to 50 PCs first for speed
# - perplexity = 50: effective number of neighbours considered per point
# - theta = .5: speed/accuracy trade-off (0 would be exact t-SNE)
tsne_out <- Rtsne(foo,
initial_dims = 50,
perplexity = 50,
partial_pca=TRUE,
theta=.5,
num_threads=4,
verbose=1)
## Performing PCA
## Read the 1658 x 50 data matrix successfully!
## OpenMP is working. 4 threads.
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.30 seconds (sparsity = 0.164029)!
## Learning embedding...
## Iteration 50: error is 65.491716 (50 iterations in 0.19 seconds)
## Iteration 100: error is 65.491716 (50 iterations in 0.14 seconds)
## Iteration 150: error is 65.491716 (50 iterations in 0.15 seconds)
## Iteration 200: error is 65.491716 (50 iterations in 0.17 seconds)
## Iteration 250: error is 65.491716 (50 iterations in 0.20 seconds)
## Iteration 300: error is 2.040402 (50 iterations in 0.17 seconds)
## Iteration 350: error is 1.699851 (50 iterations in 0.12 seconds)
## Iteration 400: error is 1.650918 (50 iterations in 0.13 seconds)
## Iteration 450: error is 1.627877 (50 iterations in 0.11 seconds)
## Iteration 500: error is 1.616187 (50 iterations in 0.14 seconds)
## Iteration 550: error is 1.609548 (50 iterations in 0.11 seconds)
## Iteration 600: error is 1.606150 (50 iterations in 0.12 seconds)
## Iteration 650: error is 1.602474 (50 iterations in 0.11 seconds)
## Iteration 700: error is 1.600997 (50 iterations in 0.12 seconds)
## Iteration 750: error is 1.599377 (50 iterations in 0.11 seconds)
## Iteration 800: error is 1.597540 (50 iterations in 0.11 seconds)
## Iteration 850: error is 1.596278 (50 iterations in 0.11 seconds)
## Iteration 900: error is 1.595396 (50 iterations in 0.11 seconds)
## Iteration 950: error is 1.594218 (50 iterations in 0.11 seconds)
## Iteration 1000: error is 1.593394 (50 iterations in 0.11 seconds)
## Fitting performed in 2.67 seconds.
# 2-D coordinates of each word from the embedding
tsne_out1<-as.data.frame(tsne_out$Y)
# Join coordinates back to the words; keep only mid-frequency words so the
# plot stays readable
tsne_out2<-cbind(word,tsne_out1)%>%
  left_join(dfm_count)%>%
  filter(Frequency<7000 & Frequency>150)
ggplot(tsne_out2, aes(x=V1, y=V2))+
  geom_text_repel(aes(label=word,
                      size=log10(Frequency),
                      alpha=log10(Frequency)),
                  color="black",
                  # fix: the ggrepel argument is max.overlaps (plural);
                  # the original max.overlap was silently ignored
                  max.overlaps=Inf)+
  theme(legend.position = "none")+
  labs(x=NULL, y=NULL)+
  scale_size(range = c(.1, 3.5))
ggsave("./Images/tsne.jpeg", width = 28, height = 20, units = "cm")
A quick and dirty quanteda function
textplot_network
It is based on igraph. that will be the end of the
lecture.
## Co-occurrence computing
# Feature co-occurrence matrix: counts of feature pairs appearing together
# within the same document
tag_fcm <- fcm(dfm)
head(tag_fcm)
## Feature co-occurrence matrix of: 6 by 1,658 features.
## features
## features février fille inscrire caf faire démarche temps heure contacter
## février 92 27 27 180 544 111 108 45 109
## fille 0 339 52 128 618 180 172 95 81
## inscrire 0 0 100 52 449 128 91 56 65
## caf 0 0 0 1652 1669 342 396 166 406
## faire 0 0 0 0 5666 3362 2158 1502 1750
## démarche 0 0 0 0 0 897 627 407 498
## features
## features téléphone
## février 128
## fille 106
## inscrire 95
## caf 490
## faire 2447
## démarche 665
## [ reached max_nfeat ... 1,648 more features ]
# Select the 100 most frequent features
toptag <- names(topfeatures(dfm, 100))
topgat_fcm <- fcm_select(tag_fcm, pattern = toptag) # keep co-occurrences among the 100 most frequent words (the code selects 100, not 500)
# Network plot of word co-occurrences among those top words
textplot_network(topgat_fcm,
min_freq = 0.5,
edge_alpha = 0.01,
edge_size = 0.1,
vertex_labelsize = 2.5)
ggsave("./Images/network.jpeg", width = 28, height = 20, units = "cm")
Raw frequencies can be misleading, as frequent words are not always distinctive. The idea is to weight term frequency by document frequency, which is the number of documents in which the term can be observed.
\[TfIdf_{i}=Tf_{i}\ln(\frac{N}{n_{i}})\] where N is the Total number of document and \(n_{i}\) the number of document in which term i is present.
To change the style of writing these operations, we will use
tidytext. The code comes from:
# Per-document counts of content-word lemmas (tidytext route, from the raw
# UDPipe annotation rather than the quanteda dfm)
foo<-UD %>%
filter(upos=="NOUN"|upos=="PROPN"|upos=="VERB"|upos=="ADJ")%>%
group_by(doc_id)%>%
count(doc_id, lemma, sort=TRUE)
# Total number of kept tokens per document (denominator for term frequency)
total_words <- foo %>%
group_by(doc_id) %>%
summarize(total = sum(n))
# Term frequency and within-document frequency rank.
# NOTE(review): summarise() returning multiple rows per group is deprecated
# in recent dplyr (>= 1.1 suggests reframe() or mutate()+select()); the
# original form is kept here to preserve row and group ordering exactly.
Avis_words <- left_join(foo, total_words) %>%
mutate(Term_frequency=n/total)%>%
group_by(doc_id)%>%
summarise(word=lemma,rank = row_number(),
n=n,
Term_frequency=Term_frequency)
Avis_words
## # A tibble: 567,622 x 5
## # Groups: doc_id [16,576]
## doc_id word rank n Term_frequency
## <chr> <chr> <int> <int> <dbl>
## 1 doc1 avoir 1 2 0.0556
## 2 doc1 téléphone 2 2 0.0556
## 3 doc1 10mn 3 1 0.0278
## 4 doc1 1500euro 4 1 0.0278
## 5 doc1 accoucher 5 1 0.0278
## 6 doc1 aide 6 1 0.0278
## 7 doc1 allocation 7 1 0.0278
## 8 doc1 APL 8 1 0.0278
## 9 doc1 attente 9 1 0.0278
## 10 doc1 bête 10 1 0.0278
## # ... with 567,612 more rows
Just test on two documents to get a flavour of the story.
Distribution and zipf like plot.
# Term-frequency distribution for two sample documents
Avis_words %>%
  filter(doc_id %in% c("doc12441", "doc12912")) %>%
  ggplot(aes(x = Term_frequency, fill = doc_id)) +
  geom_histogram(show.legend = FALSE) +
  # xlim(NA, 0.0009) +
  facet_wrap(~doc_id, ncol = 2, scales = "free_y")
# Zipf-style plot: term frequency against rank, on log-log axes
Avis_words %>%
  filter(doc_id %in% c("doc12441", "doc12912")) %>%
  ggplot(aes(x = rank, Term_frequency, color = doc_id)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
tfidf computing with tidytext function bind_tf_idf
# tf-idf with the tidytext helper bind_tf_idf (adds tf, idf, tf_idf columns)
Avis_tf_idf <- Avis_words %>%
  bind_tf_idf(word, doc_id, n)
Avis_tf_idf %>%
  # fix: the original select(-rank, n, Term_frequency) mixed negative and
  # positive selection, which modern tidyselect rejects; the printed output
  # keeps n and Term_frequency, so only rank is dropped
  select(-rank) %>%
  filter(tf_idf < 0.5) %>%
  arrange(desc(tf_idf))
## # A tibble: 560,204 x 7
## # Groups: doc_id [16,528]
## doc_id word n Term_frequency tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 doc12268 devenir 1 0.125 0.125 4.00 0.500
## 2 doc8600 pension 1 0.125 0.125 4.00 0.500
## 3 doc13167 Iran 3 0.06 0.06 8.33 0.500
## 4 doc10978 vingtaine 1 0.0769 0.0769 6.50 0.500
## 5 doc12466 précarité 1 0.0769 0.0769 6.50 0.500
## 6 doc14037 essaye 1 0.0769 0.0769 6.50 0.500
## 7 doc14048 covidre 1 0.0769 0.0769 6.50 0.500
## 8 doc2286 armer 1 0.0769 0.0769 6.50 0.500
## 9 doc555 réinscrire 2 0.0769 0.0769 6.50 0.500
## 10 doc6155 Séville 3 0.0769 0.0769 6.50 0.500
## # ... with 560,194 more rows
# Top-15 tf-idf words for the two sample documents, one panel each
Avis_tf_idf %>%
  filter(doc_id %in% c("doc12441", "doc12912")) %>%
  group_by(doc_id) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = doc_id)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~doc_id, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
# Overall distribution of tf-idf values across the corpus
Avis_tf_idf %>%
  ggplot(aes(tf_idf)) +
  geom_histogram(binwidth = 0.002) +
  xlim(0, 1)
Reuse of tsne procedure. But first compute distance
# Build a document x word tf-idf matrix for the second t-SNE run,
# keeping only words seen more than 50 times in the corpus
foo<-Avis_tf_idf %>%
  left_join(dfm_count) %>%
  filter(Frequency>50)%>%
  select(doc_id, word, tf_idf) %>%
  # fix: pass id_cols by name (positional id_cols is deprecated in tidyr) and
  # fill structurally-missing document/word pairs with 0 at creation time,
  # which replaces the mutate(across(everything(), replace_na)) step
  pivot_wider(id_cols = doc_id, names_from = "word", values_from = "tf_idf",
              values_fill = 0)%>%
  column_to_rownames(var="doc_id")
# Transpose so rows are words, and keep the word labels aside
foo<-t(foo)
word<-as.data.frame(rownames(foo))%>%
  rename(word=1)
then apply the model
set.seed(42) # Sets seed for reproducibility
# Same t-SNE settings as the first run; check_duplicates = FALSE because
# several words can share an identical tf-idf profile after filtering,
# which Rtsne would otherwise reject
tsne_out <- Rtsne(foo,
initial_dims = 50,
perplexity = 50,
partial_pca=TRUE,
theta=.5,
num_threads=4,
verbose=1,
check_duplicates = FALSE)
## Performing PCA
## Read the 1609 x 50 data matrix successfully!
## OpenMP is working. 4 threads.
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.25 seconds (sparsity = 0.172802)!
## Learning embedding...
## Iteration 50: error is 64.695991 (50 iterations in 0.20 seconds)
## Iteration 100: error is 64.696010 (50 iterations in 0.19 seconds)
## Iteration 150: error is 64.696014 (50 iterations in 0.21 seconds)
## Iteration 200: error is 64.696001 (50 iterations in 0.24 seconds)
## Iteration 250: error is 64.696010 (50 iterations in 0.29 seconds)
## Iteration 300: error is 2.906426 (50 iterations in 0.23 seconds)
## Iteration 350: error is 1.785382 (50 iterations in 0.13 seconds)
## Iteration 400: error is 1.724227 (50 iterations in 0.12 seconds)
## Iteration 450: error is 1.691735 (50 iterations in 0.11 seconds)
## Iteration 500: error is 1.677824 (50 iterations in 0.12 seconds)
## Iteration 550: error is 1.666957 (50 iterations in 0.12 seconds)
## Iteration 600: error is 1.658279 (50 iterations in 0.12 seconds)
## Iteration 650: error is 1.652063 (50 iterations in 0.12 seconds)
## Iteration 700: error is 1.648806 (50 iterations in 0.12 seconds)
## Iteration 750: error is 1.645790 (50 iterations in 0.12 seconds)
## Iteration 800: error is 1.643270 (50 iterations in 0.12 seconds)
## Iteration 850: error is 1.642125 (50 iterations in 0.11 seconds)
## Iteration 900: error is 1.640837 (50 iterations in 0.11 seconds)
## Iteration 950: error is 1.639355 (50 iterations in 0.11 seconds)
## Iteration 1000: error is 1.638975 (50 iterations in 0.11 seconds)
## Fitting performed in 3.00 seconds.
# 2-D coordinates from the tf-idf embedding (kept as a matrix here, so the
# cbind below yields columns named "1" and "2")
tsne_out1<-tsne_out$Y
tsne_out2<-as.data.frame(cbind(word,tsne_out1) )%>%
  left_join(dfm_count)%>%
  filter(Frequency<7000 & Frequency>100)
ggplot(tsne_out2, aes(x=`1`, y=`2` ))+
  geom_text_repel(aes(label=word,
                      size=log10(Frequency),
                      alpha=log10(Frequency)),
                  color="black",
                  # fix: the ggrepel argument is max.overlaps (plural);
                  # the original max.overlap was silently ignored
                  max.overlaps=Inf)+
  theme(legend.position = "none")+
  labs(x=NULL, y=NULL)+
  scale_size(range = c(.1, 3))
ggsave("./Images/tsne2.jpeg", width = 28, height = 20, units = "cm")
Playing with keyness, a statistic to find discriminant words across groups of texts. https://quanteda.io/reference/textstat_keyness.html
( from NPS game)
# 1 corpus definition (NPS game data, one row per review)
df_work<-readRDS("./Data/df_work.rds")
corpus<-corpus(df_work,text_field ="description")
# 2 tokenisation (grouped by the sentiment variable "ressenti")
toks <- tokens(corpus, remove_punct = TRUE) %>%
tokens_remove(pattern = stopwords("fr"))%>%
# drop "très" and any word starting with it (glob pattern)
tokens_remove(pattern="très*") %>%
tokens_group(groups = ressenti)
# 3 dfm building, keeping terms seen at least 40 times
dfm <- dfm(toks) %>%
dfm_trim(min_termfreq = 40, verbose = FALSE)
# 4 comparison word cloud, one colour per sentiment group
# NOTE(review): `col` is not defined anywhere in this file — confirm the
# palette vector is created earlier in the project, otherwise this errors
textplot_wordcloud(dfm,comparison = TRUE, color = col)
Keyness for positive and negative.
A clear answer !
# Create a dfm per group
# NOTE(review): toks was already grouped by ressenti above, so this second
# tokens_group() call is redundant (though harmless)
dfm <-toks %>%
tokens_group(groups = ressenti) %>%
dfm()
# Calculate keyness with "Positif" as the target group against all other categories
result_keyness <- textstat_keyness(dfm, target = "Positif") %>%
filter (n_target>20)
# Plot estimated word keyness for the positive group
g1<-textplot_keyness(result_keyness, n = 20L, labelsize = 3, show_legend = FALSE,
show_reference = TRUE, color = c("Darkgreen", "gray"))+
labs(x=NULL)
# Calculate keyness with "Négatif" as the target group against all other categories
result_keyness <- textstat_keyness(dfm, target = "Négatif") %>%
filter (n_target>20)
# Plot estimated word keyness for the negative group
g2<-textplot_keyness(result_keyness, n = 20L, labelsize = 3, show_legend = FALSE,
show_reference = TRUE, color = c("firebrick", "gray"))+
labs(x=NULL)
# Side-by-side panels, labelled A/B
plot_grid(
g1, g2,
labels = "AUTO"
)
ggsave("./Images/Keyness.jpeg", width = 28, height = 20, units = "cm")
a complete graph with igraph
Beware of computing time! It is best to sample for testing. (come back to the beginning)
# Report the total elapsed time for the session (t1 was set at the top)
t2 <- Sys.time()
# renamed from `t`, which shadowed the base transpose function t()
elapsed <- t2 - t1
print(elapsed)
## Time difference of 10.04649 mins
See you later, and on to session 3.
Some exercises before, for training :