Tools

Mainly Quanteda and udpipe

knitr::opts_chunk$set(echo = TRUE, include=TRUE, message=FALSE, warning=FALSE)
library(tidyverse) # a swiss kniffe
library(udpipe) # syntaxis and lexical annotations
library(flextable) #for table rendering
library(ggwordcloud) #for ploting
library(cowplot) #for ploting
library(quanteda) #a complete text mining package
library(quanteda.textmodels)
library(quanteda.textstats)
library(quanteda.textplots)
library(quanteda.dictionaries)
library(syuzhet)  #analyse du sentimeent
library(lubridate) #date processing
library(udpipe) #annotations et tokenization
library(igraph) #network representations
library(FactoMineR)
library(factoextra)
theme_set(theme_minimal())

t1=Sys.time()

Dataset : a first view

User Experience project from DITP Services public+

Comments are relative to a life experience (renew a passport) at a certain time, with a specific administration in a certain place.

Sentiment distribution

The score has been compute with a transformer model, we will examine it further at session 6. It is a probability to be positive.

#read the file and sample to reduce computation ( look at the end)
#we put the content of the csv file in a dataframe and select only certain columns
df <- read_csv("./Data/data_original_12062022_normalise_sentiment_transformers_lemna_words_adj.csv")%>%
  select (id, date_ecrit, titre, description, ressenti, pays,intitule, canaux_typologie_1, transformer_sentiment_score) 

# we treat the data
# the mutate function enables us to create or modify a column 
# here, we manipulate data for correcting default of the text.
df<-df %>% 
  mutate(description=str_replace_all(description,"\\.(?=[A-Za-z])", ". "), #space after dot
         description=str_replace(description,"_", " "), #remove underscore
         description=str_replace(description,"(\u20AC)", "euro") )#change euro in euro


#%>% sample_n(1000) to avoid too long intermediary computing.

head(df, 5) #we display the first 5 rows of the data frame

## # A tibble: 5 x 9
##       id date_ecrit titre          descr~1 resse~2 pays  intit~3 canau~4 trans~5
##    <dbl> <date>     <chr>          <chr>   <chr>   <chr> <chr>   <chr>     <dbl>
## 1 280177 2019-03-22 Suivi de doss~ J'ai a~ Négatif Fran~ CAF     Téléph~  0.373 
## 2 280174 2019-03-22 Délai de répo~ J'ai e~ Négatif Fran~ CAF     Compte~  0.0579
## 3 280171 2019-03-26 RSA et Prime ~ J’ai f~ Négatif Fran~ CAF     Démarc~  0.484 
## 4 280169 2019-03-27 Nouvel alloca~ J'ai d~ Neutre  Fran~ <NA>    <NA>     0.116 
## 5 280166 2019-03-28 CAF : Un manq~ J'habi~ Négatif Fran~ CAF     Démarc~  0.295 
## # ... with abbreviated variable names 1: description, 2: ressenti, 3: intitule,
## #   4: canaux_typologie_1, 5: transformer_sentiment_score

We plot the transformer sentiment distribution, as a density and as a cumulative distribution. The plot_grid() function provides a simple interface for arranging plots into a grid and adding labels to them, it is offered by cowplot.

g1<-ggplot(df,aes(x=transformer_sentiment_score))+
  geom_density(fill="pink", alpha=.5)

g2<-ggplot(df, aes(x=transformer_sentiment_score)) +
  stat_ecdf(geom = "step",pad = FALSE)+
  xlim(0,1)

#cowplot

plot_grid(
  g1, g2,
  labels = "AUTO"
  )

ggsave("./Images/sentiment1.jpeg", width = 28, height = 20, units = "cm")

Compare with ” smiles”

#score et "ressenti"

g2<-ggplot(df, aes(y=transformer_sentiment_score, x=ressenti)) +
  geom_violin(fill="pink", alpha=.5)+
  ylim(0,1)+
  geom_smooth()+
  labs(x=NULL, y = "Score transformer")

g2

ggsave("./Images/Sentiment2.jpeg", width = 28, height = 20, units = "cm")

channels involved

Each experience is relative to different channels. As this variable is encoded as a list of multiple choice, we need to rebuild a table with with as many columns as the modalities of the channel variable.

## multiple choice question recoding
typo<-as.data.frame(str_split_fixed(df$canaux_typologie_1, ",", 10))%>%
  mutate(across(where(is.character), str_trim)) %>% 
  cbind(df$id) %>%
  rename(id=11) %>%
           pivot_longer(-id, names_to = "valeur", values_to="variable") %>%
  filter(variable!="") %>% 
  group_by(id, variable) %>% 
  summarise(valeur=1) %>%
  pivot_wider(id,names_from="variable", values_from="valeur")%>%
  replace(is.na(.), 0)
head(typo)

## # A tibble: 6 x 12
## # Groups:   id [6]
##       id Démar~1 Télép~2 E-mai~3 Accueil Compt~4 Courr~5 Tchat~6 Patro~7 Résea~8
##    <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 268971       1       1       0       0       0       0       0       0       0
## 2 268976       0       1       1       0       0       0       0       0       0
## 3 268983       0       0       0       1       0       0       0       0       0
## 4 269003       0       1       0       0       0       0       0       0       0
## 5 269006       0       0       0       1       0       0       0       0       0
## 6 269007       1       1       0       0       1       0       0       0       0
## # ... with 2 more variables: `Visio/Video` <dbl>, `,` <dbl>, and abbreviated
## #   variable names 1: `Démarche en ligne`, 2: Téléphone, 3: `E-mail`,
## #   4: `Compte ou espace personnel`, 5: Courrier,
## #   6: `Tchat (messagerie instantanée)`, 7: Patrouille, 8: `Réseaux Sociaux`

#count the number of rows
n_c<-nrow(typo)
n_c

## [1] 15839

More data processing for the ggplot.

foo<-typo %>% 
  pivot_longer(-id, names_to = "variable", values_to="value") %>% 
  group_by(variable)%>%
  summarise(Penetration=mean(value)) %>%filter(variable!=",")

ggplot(foo, aes(x=reorder(variable, Penetration),y=Penetration))+
  geom_bar(stat="identity",fill="firebrick")+
  coord_flip()+
  labs(x=NULL, y="taux de pénétration")

ggsave("./Images/channel.jpeg", width = 28, height = 20, units = "cm")

A more deeper exploration

#cooccurence matrix
foo1 <- as.data.frame(typo)%>%
  select(2:11)%>%
  as.matrix()
out <- crossprod(foo1)  # Same as: t(X) %*% X
#diag(out) <- 0       # (b/c you don't count co-occurrences of an aspect with itself)

out<- as.data.frame(out) %>%
  rownames_to_column(var="Channels")%>% 
  pivot_longer(-Channels,names_to = "Channelsb", values_to = "frequency")

ggplot(data = out, aes(x=Channels, y=Channelsb, fill=log10(frequency))) + 
  geom_tile()+
  scale_fill_gradient(low = "Gold", high = "Firebrick", name="Frequency")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
  labs(x=NULL, y=NULL)

# compare to a theoretical distribution
Y<-foo %>%
  column_to_rownames(var="variable")%>%
  as.matrix()

n=nrow(typo)
Z<-Y%*%t(Y) %>%as.data.frame()%>%  
  rownames_to_column(var="Channels")%>% 
  pivot_longer(-Channels,names_to = "Channelsb", values_to = "frequencyT")%>%
  mutate(frequencyT=frequencyT*n)

w<- merge(out,Z) %>%
  mutate(chi2=(frequency-frequencyT)^2/frequencyT)%>%
  mutate(chi2=ifelse(Channels==Channelsb,0,chi2))

ggplot(data = w, aes(x=Channels, y=Channelsb, fill=chi2)) + 
  geom_tile()+
  scale_fill_gradient(low = "Gold", high = "Firebrick", name="Chi2")+
  theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
  labs(x=NULL, y=NULL)

Administrations involved

Each experience concern one administrations. He is the list of most populuous administration.

#typo
foo<-typo%>%
  select(-id)

#service
t<-as.data.frame(table(df$intitule)) %>%
  filter(Freq>50)

ggplot(t, aes(x=reorder(Var1, Freq),y=Freq))+
  geom_bar(stat="identity",fill="firebrick")+
  coord_flip()+
  labs(x=NULL, y="n")

ggsave("./Images/channel_freq.jpeg", width = 28, height = 20, units = "cm")

Sentiment score comparison

foo<-df%>%mutate(n=1)%>%
  group_by(intitule)%>%
  summarise(score=mean(transformer_sentiment_score,na.rm=TRUE),n=sum(n)) %>% 
  filter(!is.na(intitule))%>%
  filter(n>50)

ggplot(foo, aes(x=reorder(intitule,score), y=score ))+  
  geom_bar(stat="identity",aes(fill=desc(n)))+
  coord_flip()

ggsave("./Images/channel_sent.jpeg", width = 28, height = 20, units = "cm")

foo<-merge(df,typo) %>%
  select(7,11:19,5) %>% 
  mutate(across(where(is.numeric), as.character))

res.mca <- MCA(foo,quali.sup=c(1, 11))

fviz_mca_var(res.mca, 
             repel = TRUE, # Avoid text overlapping (slow)
             ggtheme = theme_minimal(),
             labelsize = 2)

ggsave("./Images/channel_MCA.jpeg", width = 28, height = 20, units = "cm")

Text level analysis

Stylometry are the (old) set of technics that process the text as a whole and give information on a particular feature of style.

counting

the number of words per review.

df$n_words<-str_count(df$description)
ggplot(df,aes(x=n_words))+
  geom_density()+
  scale_x_log10()

ggsave("./Images/count.jpeg", width = 28, height = 20, units = "cm")

Evolution of the number of reviews, size of reviews, and correlation between.

foo<-df %>%
  group_by(date_ecrit)%>%
  summarise(n=n(),
            size_t=mean(n_words))

ggplot(foo, aes(x=date_ecrit, y=n))+
  geom_line()+
  scale_y_log10()+
  geom_smooth()

ggplot(foo, aes(x=date_ecrit, y=size_t))+
  geom_point()+
  scale_y_log10()+
  geom_smooth()

ggplot(foo, aes(x=n, y=size_t))+
  geom_point()+
  scale_x_log10()+  scale_y_log10()+
  geom_smooth()

Readability

On agrège sur le mois, avec des fonctions lubridate

#la fonction de calcul de lisibilité
readability<-textstat_readability(df$description, 
                                  measure = c("Flesch",
                                              "meanSentenceLength",
                                              "meanWordSyllables")) 

foo<-cbind(df[,2],readability[,2:4])
foo$date<-as.POSIXct(foo$date)

foo1<-foo %>% 
  dplyr::mutate(Year=year(date_ecrit), Month=month(date_ecrit), date=my(paste(Month, Year)))%>%
  select(-Month, -date_ecrit,-Year)%>%
  group_by(date) %>%
  summarise(Flesch=mean(Flesch, na.rm=TRUE), 
            SentenceLength= mean(meanSentenceLength, na.rm=TRUE),
            WordSyllables= mean(meanWordSyllables, na.rm=TRUE))

## Warning: 1 failed to parse.

foo2<-foo1 %>%
  pivot_longer(-date,names_to="Variable", values_to="Score")%>%
  drop_na()

ggplot(foo2,aes(x=date, y=Score, group=Variable))+
  geom_line(size=1.2, aes(color=Variable), stat="identity")+
  facet_wrap(vars(Variable), scale="free", ncol=1)+
  labs(title = "Experience readability", x=NULL, y=NULL)

Lexical diversity

lexdiv<-tokens(df$description)%>%
  textstat_lexdiv(df$text, measure = c("CTTR", "Maas"),  log.base = 10,
                  remove_numbers = TRUE,  
                  remove_punct = TRUE,  
                  remove_symbols = TRUE,
                  remove_hyphens = TRUE) 

foo<-cbind(df,lexdiv[,2:5])
foo1<-foo %>% mutate(Year=year(date_ecrit), Month=month(date_ecrit), date=my(paste(Month, Year)))%>%
  group_by(date) %>%
  summarise(CTTR=mean(CTTR, na.rm=TRUE), 
            Maas= mean(Maas, na.rm=TRUE)) %>%
  pivot_longer(-date,names_to="Variable", values_to="Score")

ggplot(foo1,aes(x=date, y=Score, group=Variable))+
  geom_line(size=1.2, aes(color=Variable), stat="identity")+
  facet_wrap(vars(Variable), scale="free", ncol=1)+
  labs(title = "Lexical diversity", x=NULL, y=NULL)

foo1<-foo %>% 
  mutate(Year=year(date_ecrit), Month=month(date_ecrit), date=my(paste(Month, Year)))%>%
  group_by(date) %>%
  summarise(CTTR=mean(CTTR, na.rm=TRUE), 
            Maas= mean(Maas, na.rm=TRUE))

cor(foo1$CTTR, foo1$Maas)

## [1] 0.5988366

ggplot(foo1,aes(x=CTTR, y=Maas))+
  geom_point(size=1.2, aes(color=date), stat="identity")+
  labs(title = "Lexical diversity", x=NULL, y=NULL)+geom_smooth(method="lm")

Sentiment analysis

a very simple function.

#library(syuzhet) analyse du sentimeent

#paramétres
method <- "nrc"
lang <- "french"
phrase<-as.character(paste0(df$titre,". ",df$description))

#extraction
emotions <- get_nrc_sentiment(phrase,language = "french")

Now we plot the evolution

emotion<-emotions[,1:8]
polarity<-subset(emotions,select=c(positive, negative))
foo<-cbind(df,polarity)%>%
mutate(Year=year(date_ecrit), Month=month(date_ecrit), date=my(paste(Month, Year)))

#mean per 
foo1<-foo %>% 
  mutate(Year=year(date_ecrit), Month=month(date_ecrit), date=my(paste(Month, Year)))%>%
  mutate(positive=positive/n_words, 
         negative=negative/n_words)%>%
  group_by(date) %>%
  summarise(positive=mean(positive, na.rm=TRUE), 
            negative= -mean(negative, na.rm=TRUE),
            valence=positive+negative,
            expressivity=positive-negative) %>%
  pivot_longer(-date,names_to="Variable", values_to="Score")

ggplot(foo1,aes(x=date, y=Score, group=Variable))+
  geom_line(size=1.2, aes(color=Variable), stat="identity")+
  labs(title = "Sentiment", x=NULL, y=NULL)+
  scale_colour_manual(values=c("Orange"," Red", "Darkgreen","Grey"))

Dictionnary methods : LIWC

#library("quanteda.dictionaries")
dict_liwc_french <- dictionary(file = "FrenchLIWCDictionary.dic",
                             format = "LIWC")
test<-liwcalike(df$description,dictionary = dict_liwc_french) %>%
  select(je,vous, il, ils, pronomimp)
foo<-cbind(df,test)

foo1<-foo %>% 
  mutate(Year=year(date_ecrit), Month=month(date_ecrit), date=my(paste(Month, Year)))%>%
  group_by(date) %>%
  summarise(je=mean(je, na.rm=TRUE), 
            vous= mean(vous, na.rm=TRUE),
            il_s=mean(il+ils,na.rm=TRUE),
            pronomimp=mean(pronomimp,na.rm=TRUE)) %>%
  pivot_longer(-date,names_to="Variable", values_to="Score")

ggplot(foo1,aes(x=date, y=Score, group=Variable))+
  geom_line(size=1.2, aes(color=Variable), stat="identity")+
  labs(title = "Sentiment", x=NULL, y=NULL)+
  scale_colour_manual(values=c("Orange"," Red", "Darkgreen","Grey"))

to go further

Own-made dictionnary

Just building list of words and given them a weight.

(we could improve the list of word and test availability of regex formulas)

my_text <- df$description
method <- "custom"
custom_lexicon <- data.frame(word=c("impot", "impôt","impots", "impôts", "taxe","taxes", "fisc", "fiscal", "fiscales", " fiscaux", "fiscalité", "redevance"),
                             value=c(1,1,1,1,1,1,1,1,1,1,1,1))
custom_distrib <- get_sentiment(my_text, method = method, lexicon = custom_lexicon)

custom_distrib<-as.data.frame(custom_distrib)
ggplot(custom_distrib,aes(x=custom_distrib))+geom_histogram()+scale_y_log10()

foo<-cbind(df,custom_distrib)

foo1<-foo %>% 
  mutate(Year=year(date_ecrit), Month=month(date_ecrit), date=my(paste(Month, Year)))%>%
  group_by(date) %>%
  summarise(custom_distrib=mean(custom_distrib, na.rm=TRUE))

ggplot(foo1,aes(x=date, y=custom_distrib))+
  geom_line(size=1.2,stat="identity")+
  labs(title = "Fisc", x=NULL, y=NULL)+
  scale_colour_manual(values=c("Orange"," Red", "Darkgreen","Grey"))

Word level analysis

Annotation are apply at the level of words : lexical (stem, lemma, Wordnet) and syntactic annotations ( PoS, syntaxis dependences).

#togofurther with wordnet

Tokens

The very first step. Useful to set the vocabulary

#df$text<-str_replace(df$text, "\\w+", "J ") # trouver la solution!!!! pour le '

corpus<-corpus(df,id_field = "id",text_field = "description")
foo<-tokens(corpus,remove_punct = TRUE, remove_symbols=TRUE, remove_numbers=TRUE)%>%
  tokens_remove(stopwords("french"))

head(foo,5)

## Tokens consisting of 5 documents and 9 docvars.
## text1 :
##  [1] "J'ai"      "accouché"  "février"   "fille"     "n'est"     "encore"   
##  [7] "inscrite"  "CAF"       "Pourtant"  "j'ai"      "fait"      "démarches"
## [ ... and 41 more ]
## 
## text2 :
##  [1] "J'ai"       "envoyé"     "messages"   "attentes"   "réponse"   
##  [6] "CAF"        "demande"    "date"       "autre"      "date"      
## [11] "S'agissant" "questions" 
## [ ... and 7 more ]
## 
## text3 :
##  [1] "J'ai"    "fait"    "demande" "RSA"     "ligne"   "trompé"  "cases"  
##  [8] "m'a"     "répondu" "n'avais" "droit"   "RSA"    
## [ ... and 53 more ]
## 
## text4 :
##  [1] "J'ai"      "déménagé"  "l'Hérault" "Gard"      "mois"      "janvier"  
##  [7] "dossier"   "n'a"       "toujours"  "traité"    "alors"     "j'ai"     
## [ ... and 11 more ]
## 
## text5 :
##  [1] "J'habite"     "village"      "bout"         "département"  "permanence"  
##  [6] "CAF"          "après"        "avoir"        "subi"         "l'annulation"
## [11] "rendez-vous"  "permanence"  
## [ ... and 147 more ]

foo1 <-unlist_tokens(foo) 
dim(foo1)

## [1] 869021      2

foo2<-foo1 %>% 
  group_by(token)%>%
  summarise(n=n())%>%
              mutate(rank=rank(desc(n)))
                     
dim(foo2)

## [1] 44512     3

ggplot(foo2, aes(x=rank,y=n))+
  geom_point(alpha=.2)+geom_smooth(method=lm)+
  scale_x_log10()+
  scale_y_log10()+
  labs(title = "Zipf like")

#with cleaning

dfmat1 <- dfm(foo,
              remove = stopwords("french"), remove_punct = TRUE) %>%
   dfm_trim(min_termfreq = 3)

textplot_wordcloud(dfmat1, max_words = 50)

dfmat2 <- dfm(corpus_subset(corpus, intitule == "CAF"),
              remove = stopwords("french"), remove_punct = TRUE) %>%
   dfm_trim(min_termfreq = 3)

textplot_wordcloud(dfmat2, max_words = 150)

ggsave("./Images/worcloud.jpeg", width = 28, height = 20, units = "cm")

collocation

looking for expressions

#on sélectionne les mots commençant par une majuscule
toks_cap <- tokens_select(foo, 
                               pattern = "^[A-Z]",
                               valuetype = "regex",
                               case_insensitive = FALSE, 
                               padding = TRUE)

#on cherche les collocations
tstat_col_cap <- textstat_collocations(toks_cap, min_count = 3, tolower = FALSE)
#head(as.data.frame(tstat_col_cap),15)

toks_comp <- tokens_compound(foo, pattern = tstat_col_cap[tstat_col_cap$lamba > 10], 
                             case_insensitive = FALSE)

head(toks_comp)

## Tokens consisting of 6 documents and 9 docvars.
## text1 :
##  [1] "J'ai"      "accouché"  "février"   "fille"     "n'est"     "encore"   
##  [7] "inscrite"  "CAF"       "Pourtant"  "j'ai"      "fait"      "démarches"
## [ ... and 41 more ]
## 
## text2 :
##  [1] "J'ai"       "envoyé"     "messages"   "attentes"   "réponse"   
##  [6] "CAF"        "demande"    "date"       "autre"      "date"      
## [11] "S'agissant" "questions" 
## [ ... and 7 more ]
## 
## text3 :
##  [1] "J'ai"    "fait"    "demande" "RSA"     "ligne"   "trompé"  "cases"  
##  [8] "m'a"     "répondu" "n'avais" "droit"   "RSA"    
## [ ... and 53 more ]
## 
## text4 :
##  [1] "J'ai"      "déménagé"  "l'Hérault" "Gard"      "mois"      "janvier"  
##  [7] "dossier"   "n'a"       "toujours"  "traité"    "alors"     "j'ai"     
## [ ... and 11 more ]
## 
## text5 :
##  [1] "J'habite"     "village"      "bout"         "département"  "permanence"  
##  [6] "CAF"          "après"        "avoir"        "subi"         "l'annulation"
## [11] "rendez-vous"  "permanence"  
## [ ... and 147 more ]
## 
## text6 :
##  [1] "J'avais"      "rendez-vous"  "CAF"          "Gard"         "Celui-ci"    
##  [6] "s'est"        "très"         "bien"         "passé"        "explications"
## [11] "claires"      "concises"    
## [ ... and 2 more ]

Lemmas and POS

library(udpipe)
fr <- udpipe_download_model(language = "french")
udmodel_french <- udpipe_load_model(file = "french-gsd-ud-2.5-191206.udpipe")

UD <- udpipe_annotate(udmodel_french, x=df$description, trace =1000,parallel.cores = 4)

## 2022-10-30 16:26:19 Annotating text fragment 1/16576
## 2022-10-30 16:29:31 Annotating text fragment 1001/16576
## 2022-10-30 16:33:04 Annotating text fragment 2001/16576
## 2022-10-30 16:36:20 Annotating text fragment 3001/16576
## 2022-10-30 16:39:08 Annotating text fragment 4001/16576
## 2022-10-30 16:41:43 Annotating text fragment 5001/16576
## 2022-10-30 16:44:15 Annotating text fragment 6001/16576
## 2022-10-30 16:46:45 Annotating text fragment 7001/16576
## 2022-10-30 16:48:55 Annotating text fragment 8001/16576
## 2022-10-30 16:51:08 Annotating text fragment 9001/16576
## 2022-10-30 16:53:29 Annotating text fragment 10001/16576
## 2022-10-30 16:55:42 Annotating text fragment 11001/16576
## 2022-10-30 16:58:07 Annotating text fragment 12001/16576
## 2022-10-30 17:01:17 Annotating text fragment 13001/16576
## 2022-10-30 17:03:50 Annotating text fragment 14001/16576
## 2022-10-30 17:06:15 Annotating text fragment 15001/16576
## 2022-10-30 17:08:33 Annotating text fragment 16001/16576

UD <- as.data.frame(UD)
saveRDS(UD, "./Data/UD.rds")

Let examine the content : lemma per POS.

UD<-readRDS("./Data/UD.rds")
foo<-UD%>%
  group_by(upos)%>%
  summarise(n=n())%>%
  ggplot(aes(x=reorder(upos,n), y=n))+
  geom_bar(stat = "identity")+coord_flip()

foo

foo<-UD %>%
  filter(upos=="NOUN") %>%
  group_by(lemma)%>%
  summarise(n=n()) %>%
  filter(n>1000)%>%
  ggplot(aes(x=reorder(lemma,n), y=n))+
  geom_point(size=1.5, fill="blue3")+coord_flip()

foo

foo<-UD %>%
  filter(upos=="ADJ" |upos=="VERB") %>%
  group_by(lemma, upos)%>%
  summarise(n=n()) %>%
  filter(n>300)%>%
  ggplot(aes(label = lemma, size = log(n), group=upos)) +
  geom_text_wordcloud(aes(color=upos)) +
  theme_minimal()+
  facet_wrap(vars(upos))

foo

Syntaxic dependance

A few `dplyr gymnastics’, and thats already the session 2 .

We follow for idea of code.

for dataviz igraph is the tool.

foo1<-UD %>%
  mutate(id=paste0(doc_id,paragraph_id,sentence_id,token_id))%>%
  select(id, lemma)%>%
  rename(noun=lemma)

foo<-UD %>%
  filter(dep_rel=="amod")%>%
  mutate(id=paste0(doc_id,paragraph_id,sentence_id,head_token_id))%>%
  left_join(foo1)%>%
  select(id, lemma, noun)%>%
  rename(adj=lemma)%>%
  group_by(noun,adj)%>%
  summarise(n=n())%>%
  filter(n>50)
#A  Correspondance Analysis solution?
#igraph approach belong to the netx lesson 
library(igraph)
g <- graph.data.frame(foo, directed=FALSE)
V(g)$type <- bipartite_mapping(g)$type  ## Add the "type" attribute
V(g)$label.color <- ifelse(V(g)$type, "salmon4", "blue2")
V(g)$fill <- ifelse(V(g)$type, "salmon4", "blue2")
V(g)$shape <-ifelse(V(g)$type, "circle", "square")
plot(g,vertex.label.cex = 0.8)

Notes

Beware to computing time! Best to sample for testing. (come back to the beginning)

t2=Sys.time()
t<- t2-t1
print(t)

## Time difference of 1.058051 hours

We save the annotated dataset in a format *.rds for further usage.

df_work<-cbind(df,readability, lexdiv,emotions,test)
write_rds(df_work,"./Data/df_work.rds")

See you tomorrow and go to session 2

Some exercises before, for training :

compute the sentiment evolution for one administration of your choice.
…

Session1 : text metric and word annotations

cb,jcrm,bc,oc

2022-10-30