Tools

# Chunk options: show code and output; silence package messages/warnings.
knitr::opts_chunk$set(echo = TRUE, include=TRUE, message=FALSE, warning=FALSE)
# from sessions 1 & 2
library(tidyverse)
library(udpipe)
library(flextable)
library(cowplot)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(syuzhet)  # sentiment analysis

# new for session 3
library(FactoMineR)
library(factoextra)
library(igraph)
library(ggwordcloud)
library(ggrepel)

library(Rtsne)
library(tidytext)

# new for session 4: word embeddings and phylogenetic-style trees
library(word2vec)
library(ape)

# Global ggplot theme for every figure in the document.
theme_set(theme_minimal()+theme(plot.title = element_text(size=12)))

# Start a timer (presumably to report total knitting time at the end).
t1=Sys.time()

Building embeddings

The idea came from information retrieval, with the work of Salton for example. But the breakthrough came from Mikolov et al. (2013).

A bit of theory

Main idea: encode each word as a vector of large dimension. Since the meaning of a word comes from its co-occurrence with other words within a certain window, words used in similar contexts end up with similar vectors in that space.

Word as vector

Two algorithms :

  • Skip-gram: works well with a small amount of the training data, represents well even rare words or phrases.
  • CBOW: several times faster to train than the skip-gram, slightly better accuracy for the frequent words.

Application

We use here the package word2vec, but other solutions exist, such as Ben Schmidt's wordVectors package, or GloVe from the text2vec package.

# Load the UDPipe-annotated corpus.
UD <- readRDS("./data/UD.rds")

# Keep only content words (nouns, proper nouns, adjectives, verbs)
# and lower-case the lemmas.
kept_pos <- c("NOUN", "PROPN", "ADJ", "VERB")
updated_vocab <- UD %>%
  filter(upos %in% kept_pos) %>%
  mutate(lemma = tolower(lemma))

# Frequency table of the retained lemmas (columns: word, n).
updated_vocab2 <- updated_vocab %>%
  count(lemma, name = "n") %>%
  rename(word = lemma)

# Rebuild one filtered pseudo-text per document.
text2 <- updated_vocab %>%
  group_by(doc_id) %>%
  summarise(description = paste(lemma, collapse = " "))

# Train the embedding model (CBOW, 200 dimensions, window of 7).
# NOTE(review): with threads = 4 the training is not exactly
# reproducible despite set.seed() — confirm if exact replication matters.
set.seed(123456789)
model <- word2vec(
  x = text2$description,
  type = "cbow",
  window = 7,
  dim = 200,
  iter = 100,
  verbose = 10,
  threads = 4L
)

# One row per vocabulary word, one column per embedding dimension.
embedding <- as.matrix(model)

#test sur reviews

# Nearest neighbours of "impot" (tax) in the embedding space.
lookslike <- predict(model, c("impot"), type = "nearest", top_n = 20)
nearest_impot <- lookslike$impot

# Dot plot of the 20 most similar terms, most similar at the top.
g1 <- ggplot(nearest_impot,
             aes(x = reorder(term2, similarity), y = similarity)) +
  geom_point(col = "black", size = 3) +
  coord_flip() +
  ggtitle("Impot")
g1

# Word analogy: telephone - internet + contact.
wv <- predict(model, newdata = c("telephone", "internet", "contact"),
              type = "embedding")
wv <- wv["telephone", ] - wv["internet", ] + wv["contact", ]

# Terms closest to the analogy vector.
predict(model, newdata = wv, type = "nearest", top_n = 10)
##         term similarity rank
## 1  telephone  0.9619653    1
## 2    contact  0.9515743    2
## 3  téléphone  0.8195363    3
## 4  contacter  0.6805366    4
## 5      appel  0.6747885    5
## 6    reponse  0.6428372    6
## 7   relation  0.6320581    7
## 8     amical  0.6299692    8
## 9   répondre  0.6264025    9
## 10   numéros  0.6244956   10

Clustering

# Build a typology of terms: hierarchical clustering (Ward) on a
# correlation-based distance between word vectors.

library(fastcluster)  # faster drop-in replacement for hclust
distance <- as.dist(1 - cor(t(embedding)))
arbre <- hclust(distance, method = "ward.D2")
plot(arbre, xlab = "", ylab = "", sub = "", axes = FALSE, hang = -1)
rect.hclust(arbre, 16, border = "green3")

# Cut the tree into 16 groups.
group <- as.data.frame(cutree(arbre, k = 16))

# Attach word frequencies to the cluster membership.
group <- group %>%
  rownames_to_column(var = "word") %>%
  rename(group = 2) %>%
  left_join(updated_vocab2, by = "word")

# Keep mid-frequency terms so the word clouds stay readable.
foo <- group %>%
  filter(n > 100 & n < 7000)

library(ggwordcloud)  # already loaded above; kept for chunk independence
# FIX: plot the filtered table `foo` — the original plotted the full
# `group` table, leaving the frequency filter above unused.
ggplot(foo, aes(label = word, size = n, color = n)) +
  geom_text_wordcloud_area() +
  scale_size_area(max_size = 10) +
  facet_wrap(vars(group), ncol = 4)

ggsave("./Images/embedding2.jpeg", width = 28, height = 20, units = "cm")

Encore un TSNE

# 2-D t-SNE projection of the word-distance matrix.
set.seed(57)
rtsne_out <- Rtsne(
  distance,
  perplexity = 50,
  initial_dims = 50,
  theta = 0.5,
  partial_pca = TRUE,
  check_duplicates = FALSE,
  num_threads = 4,
  verbose = 1
)
## Read the 6737 x 6737 data matrix successfully!
## OpenMP is working. 4 threads.
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 21.82 seconds (sparsity = 0.025241)!
## Learning embedding...
## Iteration 50: error is 87.553722 (50 iterations in 0.81 seconds)
## Iteration 100: error is 87.553722 (50 iterations in 1.25 seconds)
## Iteration 150: error is 87.553722 (50 iterations in 1.73 seconds)
## Iteration 200: error is 87.553722 (50 iterations in 2.10 seconds)
## Iteration 250: error is 87.553722 (50 iterations in 2.50 seconds)
## Iteration 300: error is 4.811237 (50 iterations in 2.60 seconds)
## Iteration 350: error is 4.811237 (50 iterations in 2.21 seconds)
## Iteration 400: error is 4.811237 (50 iterations in 1.47 seconds)
## Iteration 450: error is 4.245178 (50 iterations in 1.13 seconds)
## Iteration 500: error is 4.109718 (50 iterations in 0.55 seconds)
## Iteration 550: error is 4.066568 (50 iterations in 0.54 seconds)
## Iteration 600: error is 4.032320 (50 iterations in 0.54 seconds)
## Iteration 650: error is 4.005393 (50 iterations in 0.67 seconds)
## Iteration 700: error is 3.985965 (50 iterations in 0.55 seconds)
## Iteration 750: error is 3.968757 (50 iterations in 0.55 seconds)
## Iteration 800: error is 3.953762 (50 iterations in 0.58 seconds)
## Iteration 850: error is 3.944238 (50 iterations in 0.62 seconds)
## Iteration 900: error is 3.936891 (50 iterations in 0.60 seconds)
## Iteration 950: error is 3.930001 (50 iterations in 0.56 seconds)
## Iteration 1000: error is 3.924443 (50 iterations in 0.58 seconds)
## Fitting performed in 22.12 seconds.
# Colour palette for the 16 clusters.
# NOTE(review): this palette is never passed to the plot below; add
# scale_color_manual(values = color.vec) if cluster colours should use it.
color.vec = c("#556270", "#4ECDC4", "#1B676B", "#FF6B6B", "#C44D58", "seagreen1", "seagreen4", "slateblue4", "firebrick", "Royalblue",
              "purple","Orange","Cyan","Coral","Gold","Chartreuse")

# Attach cluster membership and frequency to each word's t-SNE coordinates.
tsne_out1 <- as.data.frame(rtsne_out$Y) %>% rename(D1 = V1, D2 = V2)
tsne_out2 <- as.data.frame(cbind(tsne_out1, embedding)) %>%
  rownames_to_column(var = "word") %>%
  left_join(group, by = "word") %>%
  filter(n < 7000 & n > 200)

# FIX: plot the t-SNE coordinates D1/D2 — the original mapped columns
# `1` and `2`, which are the first two RAW embedding dimensions, making
# the rename to D1/D2 above pointless.
# FIX: `max.overlaps` is an argument of geom_text_repel(), not an
# aesthetic, and was misspelled `max.overlap` inside aes().
ggplot(tsne_out2, aes(x = D1, y = D2)) +
  geom_text_repel(aes(label = word,
                      size = log10(n),
                      alpha = log10(n),
                      color = as.factor(group)),
                  max.overlaps = Inf) +
  theme(legend.position = "none") +
  labs(x = NULL, y = NULL) +
  scale_size(range = c(.1, 3))

ggsave("./Images/embedding3.jpeg", width = 28, height = 20, units = "cm")

Text as vectors

Text vectorisation

A text vector is the sum of its word vectors. Document vectors are the sum of the vectors of the words that appear in the document, standardised by the scale of the vector space. This scale is the square root of the average inner product of the vector elements.

https://rdrr.io/cran/word2vec/man/doc2vec.html

# Build a (doc_id, text) table and vectorise each document: a document
# vector is the normalised sum of its word vectors.
x <- data.frame(
  doc_id           = text2$doc_id,
  text             = text2$description,
  stringsAsFactors = FALSE
)
# Normalise the text the way word2vec expects (lower-casing etc.).
x$text <- txt_clean_word2vec(x$text, tolower = TRUE)
# One 200-dimensional vector per document.
emb1 <- doc2vec(model, x$text, split = " ", type = "embedding")

comparison with a concept vector

## Vectorise a hand-built "justice / injustice" concept and rank the
## documents by their similarity to it.

newdoc <- doc2vec(model, "injustice arbitraire inégalité iniquité partialité équité impartialité justice légitimité", type = "embedding")

# Cosine-style similarity of every document vector to the concept vector.
foo <- word2vec_similarity(emb1, newdoc)
emb2 <- as.data.frame(cbind(foo, x)) %>%
  arrange(desc(foo))
# Ten documents closest to the concept.
head(emb2, 10)
##          foo   doc_id
## 1  0.5767817 doc15642
## 2  0.5272438  doc3902
## 3  0.5145822  doc7148
## 4  0.5077388 doc12685
## 5  0.5074009  doc2015
## 6  0.5072624 doc13110
## 7  0.5051428 doc16194
## 8  0.4987474  doc4962
## 9  0.4928515 doc15461
## 10 0.4874359  doc1075
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  text
## 1                                                                                                                                                                                                                                   accuser deposer carton rue paris france moment fait contester amende reception billet avion appui prouver pouvoir cas carton avoir nom inscrire avoir retour contestation peur amende majorer maniere arbitraire habitude pays venir payer ressentir grand sentiment injustice donner envre civique ramasser crotte chien nuit regarder histoire justifier 170euro venir debourser aller creer gens force traiter
## 2  consulat geneve suisse consulat enclave souverain pays representer sein autre nation niveau securisation appliquer niveau entree consulat preciter deplacer vu esprit suisse situation securitaire terre helvetique dispositif vigile amene exister lieu situation menace existant creer situation inapproprie deplacer croire vouloir devoir acceder palais justice paris periode proces disciple moderne jacques mesrine exces securitisme outrance laisser percevoir securite crainte inconfort suspicion element renforcer cote desagreable vigile glacant marbre accueillir citoyen franco helvetique rendre regler affaire etre geneve paris
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                   comprendre agent pole emploi subir fort pression devoir nombre chomeur affecter agent supporter payer frais ton violent rabaisser rester soumi agace enerve avoir marre colle etiquette erroner personne decoller
## 4                                              trouver discriminatoire inadmissible loi voter respecter explication souhaiter beneficier prime energie remplacer vieux chaudiere fuel pompe chaleur air eau presenter gratuit bon nombre revue confronter refus entreprise local preciser quatre circulaire jour octobre refu contrainte revenu seul objection l age emaner service etat fait bafouer propre loi avoir an rester droit voter taire attendre reponse faculte intellectuelle temps disponible obtenir don puisque possibilite offrir respecter condition soustraire an ne enterrer procuration date maison renover isoler euro resister
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       aide financier securite social reactivite maximal ponctualite
## 6                                                                                                                   regler question heritage falloir contacter tribunal instance ville dependre depart allemagne information fournir service tribunal instance maladroit imprecis presentation juge tribunal laisser sentiment ecoeurement degout voir vis justice francais vivre allemagne ignor mecanisme fonctionnement tribunal falloir saisir instance proximite accueil recevoir part juge arrogant cooperatif entretien tribunal comprendre vivre etranger depouiller droit citoyen francais frais timbre perdre partie heritage revenir droit
## 7                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      plein desarroi decider exprimer tchat bouteille mer adjudant chef recueillir repondre confidence merveilleux ecoute bienveillant professionnalisme humour aide
## 8                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               service consulaire sembler traiter facon hautain sentiment abandonner
## 9                                               supposer inobservation conducteur l arret absolire imposer panneau stop possibilite debattre preuve infraction voila ressenti courrier recevoir tribunal police reponse tribunal police satisfaire fait indiquer preuve contraire proces verbal faire foi avoir possibilite debattre preuve moment infraction supposer presomption innocence cote preuve suppose infraction sur courrier tribunal police cas suite long peine financier amener 750euro cas combat pot fer pot terre issue pot terre defavorable sage payer ensuite loi valoir reconnaissance suppose infraction contestation possible
## 10                                                                                                                                                                                                                                                       impot inventer ivaller monsieur recherche faire agir ex voisin pallier erreur numero appartement penser consequence invraisemblable taxe habitation refus cheque energie impossible entendre raison impot part numero tel tout inoperant mettre colere an information col doigt pansement impossible defaire monde sous entendre men pays citoyen honnete payer monde entree coupable fraude

Working in a common space

Words and texts in the common space of vectors reduced to a 2D projection.

# Stack word vectors and document vectors into one matrix so both live
# in the same space, then project that space to 2-D with t-SNE.
vector_tot <- rbind(embedding, emb1) %>%
  as.data.frame() %>%
  drop_na()

tsne_out <- Rtsne(
  vector_tot,
  perplexity = 50,
  initial_dims = 50,
  theta = 0.5,
  partial_pca = TRUE,
  check_duplicates = FALSE,
  num_threads = 4,
  verbose = 1
)  # Run TSNE
## Performing PCA
## Read the 23304 x 50 data matrix successfully!
## OpenMP is working. 4 threads.
## Using no_dims = 2, perplexity = 50.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
##  - point 10000 of 23304
##  - point 20000 of 23304
## Done in 39.89 seconds (sparsity = 0.009603)!
## Learning embedding...
## Iteration 50: error is 100.309378 (50 iterations in 2.88 seconds)
## Iteration 100: error is 100.309365 (50 iterations in 3.31 seconds)
## Iteration 150: error is 98.315164 (50 iterations in 3.36 seconds)
## Iteration 200: error is 96.480550 (50 iterations in 2.87 seconds)
## Iteration 250: error is 96.489932 (50 iterations in 2.78 seconds)
## Iteration 300: error is 4.165927 (50 iterations in 2.68 seconds)
## Iteration 350: error is 3.931285 (50 iterations in 2.41 seconds)
## Iteration 400: error is 3.802034 (50 iterations in 2.37 seconds)
## Iteration 450: error is 3.716695 (50 iterations in 2.46 seconds)
## Iteration 500: error is 3.653879 (50 iterations in 2.44 seconds)
## Iteration 550: error is 3.605808 (50 iterations in 2.45 seconds)
## Iteration 600: error is 3.567366 (50 iterations in 2.39 seconds)
## Iteration 650: error is 3.535497 (50 iterations in 2.39 seconds)
## Iteration 700: error is 3.508243 (50 iterations in 2.36 seconds)
## Iteration 750: error is 3.484360 (50 iterations in 2.41 seconds)
## Iteration 800: error is 3.464623 (50 iterations in 2.40 seconds)
## Iteration 850: error is 3.448460 (50 iterations in 2.46 seconds)
## Iteration 900: error is 3.434239 (50 iterations in 2.50 seconds)
## Iteration 950: error is 3.421134 (50 iterations in 2.60 seconds)
## Iteration 1000: error is 3.409174 (50 iterations in 2.52 seconds)
## Fitting performed in 52.03 seconds.
# Tag each projected point as a word or a document.
# FIX: derive the split point from the data instead of the hard-coded
# row counts 6737 / 23304, which silently break if the corpus changes.
# NOTE(review): assumes drop_na() above removed no word-vector rows, so
# the first nrow(embedding) rows of the projection are words — confirm.
tsne_out2 <- as.data.frame(tsne_out$Y)
n_words <- nrow(embedding)
tsne_out2[seq_len(n_words), 3] <- "word"
tsne_out2[(n_words + 1):nrow(tsne_out2), 3] <- "doc"

Still to be corrected:

w<-as.data.frame(rownames(emb1))%>% rename(tag=1) d<-as.data.frame(rownames(embedding))%>%rename(tag=1) x<-rbind(d,w) tsne_out3<-cbind(tsne_out2,x) # tsne_out3<-cbind(tsne_out2) %>% left_join(updated_vocab2) %>% filter(n>4) library(ggrepel) tsne_out3%>% ggplot(aes(x=V1, y=V2, label=tag, group=V3))+ geom_text_repel(aes(label=tag,color=V3),max.overlaps=50, size=2)+ labs(title=““, subtitle=”“, x= NULL, y=NULL)+ theme(legend.position =”none”)

Conclusion

Some alternatives: GloVe, fastText

Other packages: text2vec, doc2vec, and Ben Schmidt's wordVectors.

Other applications

  • ML: as features for machine-learning models

Some great applications : Gennaro and Ash (2022) ou Hamilton, Leskovec, and Jurafsky (2018)

The future comes from Transformers (that’s the last session)

Gennaro, Gloria, and Elliott Ash. 2022. “Emotion and Reason in Political Language.” The Economic Journal 132 (643): 1037–59. https://doi.org/10.1093/ej/ueab104.
Hamilton, William L., Jure Leskovec, and Dan Jurafsky. 2018. “Diachronic Word Embeddings Reveal Statistical Laws of Semantic Change.” arXiv. http://arxiv.org/abs/1605.09096.
Mikolov, Tomas, Kai Chen, Greg Corrado, and Jeffrey Dean. 2013. “Efficient Estimation of Word Representations in Vector Space.” arXiv. http://arxiv.org/abs/1301.3781.