uturoa

L’objet de cette étude est de comparer la production langagière des contributeurs de la plateforme TripAdvisor. On y examine l’ensemble des avis disponibles en français sur la quasi-totalité des hôtels de Polynésie et produits au cours des quatre dernières années. On explore de manière systématique l’influence de l’environnement (les îles et le temps), du type d’hôtel (nombre de chambres et prix), de l’expérience rédactionnelle des contributeurs et de leur jugement déclaré (notes) sur un certain nombre de caractéristiques du texte des avis, en particulier : leur volumétrie et sa distribution, leur valeur au regard des compétences langagières (lisibilité et diversité lexicale), la valence du sentiment éprouvé, le vécu de l’expérience éprouvée au travers de marqueurs sémantiques et, enfin, les variantes thématiques qui transpirent des textes.

L’hypothèse centrale est que les hôtels en discriminant les clients sur un critère de relative richesse induisent des contenus variables en quantité et en qualité.

Analyse quantitative des commentaires

Il s’agit d’établir les statistiques de base et d’examiner la distribution du volume de texte produit.

L’enchainement des annotations et leurs propriétés

Volume des avis

Il s’agit d’évaluer la distribution “physique”du corpus de texte défini comme l’ensemble des commentaires tripadvisor de la version française publiés à partir de juin 2015. On obtient 26467 textes. dim(comment_X)

# Announce, then display, the size of the review table before any filtering.
print("nombre total de documents")

## [1] "nombre total de documents"

# Dimensions (rows, columns) of comment_W at this point.
dim(comment_W)

## [1] 70235     8

# Keep only rated reviews (Note > 0) dated strictly after 30 June 2015.
d <- as.Date("2015-06-30", format = "%Y-%m-%d")
comment_W <- filter(comment_W, Date > d, Note > 0)
print("nombre de documents (depuis 4 ans)")

## [1] "nombre de documents (depuis 4 ans)"

# Dimensions (rows, columns) left after the filter.
dim(comment_W)

## [1] 26467     8

# Enrich the reviews with the hotel reference data. No `by` is given, so the
# join uses every column the two tables have in common.
comment_W <- comment_W %>% left_join(hotels_ref)

# Island -> destination lookup, applied one island at a time in this order.
# Rows whose island is not listed keep whatever value they already had.
island_to_destination <- c(
  "Avatoru"   = "Tuamotu",
  "Bora Bora" = "Bora Bora",
  "Hiva hoa"  = "Marquises",
  "Hiva Oa"   = "Marquises",
  "Huahine"   = "Iles vent",
  "Maupiti"   = "Iles vent",
  "Tahaa"     = "Iles vent",
  "Raiatea"   = "Iles vent",
  "Moorea"    = "Moorea",
  "Tahiti"    = "Tahiti",
  "Tetiaroa"  = "Tahiti",
  "Nuku Hiva" = "Marquises",
  "Raivavae"  = "Australes",
  "Rangiroa"  = "Tuamotu",
  "Rurutu"    = "Australes",
  "Tikehau"   = "Tuamotu"
)
for (island in names(island_to_destination)) {
  comment_W$destination[comment_W$Ile == island] <-
    island_to_destination[[island]]
}

# Both grouping variables are used as factors downstream.
factor_cols <- c("destination", "categorie")
comment_W[factor_cols] <- lapply(comment_W[factor_cols], as.factor)

comment_W$chambres <- as.numeric(comment_W$chambres)

# Hotel size class from the number of rooms. Classes are right-closed
# intervals with upper bounds 8, 15, 50 and 120.
# NOTE(review): the labels do not match these bounds (e.g. " 1-5 chambres"
# covers up to 8 rooms, "50-80 chambres" covers 51 to 120) -- confirm which
# of the two is intended before relying on the labels.
size_labels <- c(" 1-5 chambres", " 6-15 chambres", "15-50 chambres",
                 "50-80 chambres", "80 et plus chambres")
size_class <- cut(comment_W$chambres,
                  breaks = c(-Inf, 8, 15, 50, 120, Inf),
                  labels = FALSE, include.lowest = TRUE)
size_known <- !is.na(size_class)
comment_W$Taille_hotel[size_known] <- size_labels[size_class[size_known]]

# Price class; upper bounds 9999, 14999, 24999 and 39999 (right-closed).
price_labels <- c("<10 000", "<15 000", "<25 000", "<40 000", ">40 000")
price_class <- cut(comment_W$prix,
                   breaks = c(-Inf, 9999, 14999, 24999, 39999, Inf),
                   labels = FALSE, include.lowest = TRUE)
price_known <- !is.na(price_class)
comment_W$prix_classe[price_known] <- price_labels[price_class[price_known]]

# Frequency tables of the three segmentation variables.
library(AMR)
freq(comment_W, Taille_hotel)

freq(comment_W, prix_classe)

freq(comment_W, destination)

library(FactoMineR)
# Multiple correspondence analysis (Burt method, 3 dimensions kept) on the
# price class, size class and hotel category; incomplete rows are dropped.
AFCM <- na.omit(comment_W[c("prix_classe", "Taille_hotel", "categorie")])
MCA(AFCM, ncp = 3, graph = TRUE, method = "Burt")

## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 6168 individuals, described by 3 variables
## *The results are available in the following objects:
## 
##    name              description                       
## 1  "$eig"            "eigenvalues"                     
## 2  "$var"            "results for the variables"       
## 3  "$var$coord"      "coord. of the categories"        
## 4  "$var$cos2"       "cos2 for the categories"         
## 5  "$var$contrib"    "contributions of the categories" 
## 6  "$var$v.test"     "v-test for the categories"       
## 7  "$ind"            "results for the individuals"     
## 8  "$ind$coord"      "coord. for the individuals"      
## 9  "$ind$cos2"       "cos2 for the individuals"        
## 10 "$ind$contrib"    "contributions of the individuals"
## 11 "$call"           "intermediate results"            
## 12 "$call$marge.col" "weights of columns"              
## 13 "$call$marge.li"  "weights of rows"

Distribution du nombre d’avis entre les hôtels

On examine la répartition du nombre d’avis produits entre les hôtels. On observe une distribution très inégale : une dizaine d’hôtels concentrent des milliers d’avis quand le reste se contente de quelques dizaines. La courbe de Lorenz confirme l’observation : 75% des commentaires sont produits par 12,5% des hôtels.

# Number of reviews per hotel.
comment_W$Hotel <- as.factor(comment_W$Hotel)

Avis <- comment_W %>%
  group_by(Hotel) %>%
  summarise(nb_avis = n())

# Summary statistics shown on the histogram. They get their own names so
# that the base functions mean(), median() and max() are not masked by
# same-named numeric variables in the global environment.
avis_moyenne <- round(mean(Avis$nb_avis), 1)
avis_mediane <- median(Avis$nb_avis)
avis_max <- max(Avis$nb_avis)

g05a <- ggplot(Avis, aes(x = nb_avis)) +
  geom_histogram(binwidth = 20, fill = "coral3") +
  theme_minimal() +
  xlim(0, 2500) +
  ylim(0, 25) +
  annotate(
    "text", x = 1000, y = 20, size = 3,
    label = paste0("moyenne=", avis_moyenne,
                   "- médiane=", avis_mediane,
                   "- max=", avis_max)
  ) +
  labs(x = "nombre d'avis", y = "Fréquences (nb d'hôtels)",
       title = "Distribution du nombre d'avis par hôtel", caption = "")

# Concentration of the reviews across hotels: Gini index and Lorenz curve.
library(ineq)
library(gglorenz)
gini <- round(ineq(Avis$nb_avis, type = "Gini"), 2)

g05b <- Avis %>%
  ggplot(aes(nb_avis)) +
  stat_lorenz(desc = TRUE, size = 1.2, color = "darkred") +
  coord_fixed() +
  geom_abline(linetype = "dashed") +
  theme_minimal() +
  labs(x = "Part cumulée des hôtels",
       y = "Part cumulée des avis",
       title = "Concentration des avis",
       caption = "") +
  annotate("text", x = .35, y = .6, size = 3,
           label = paste0("indice de Gini=", gini))
grid.arrange(g05a, g05b, ncol = 2)

La longueur des avis

Les avis ne sont pas de la même longueur (en nombre de caractères) : ils varient de 193 à 13637 caractères, avec une longueur médiane de 502. Le volume total représente donc 18,49 millions de caractères, soit l’équivalent de 10 000 pages.

# Length of every review, in characters.
comment_W$nbcar <- nchar(comment_W$Commetaire)

# NOTE(review): the results are stored under the names mean, median, min,
# max and sum, which shadow the base functions. The names are kept because
# the histogram chunk further down reads `mean`, `median` and `max`.
mean <- round(mean(comment_W$nbcar))
# An assignment wrapped in parentheses also prints the assigned value.
(median <- median(comment_W$nbcar))

## [1] 502

(min <- min(comment_W$nbcar))

## [1] 193

(max <- max(comment_W$nbcar))

## [1] 13637

# Total number of characters, in thousands.

(sum <- sum(comment_W$nbcar) / 1000)

## [1] 18493.31

Une analyse de concentration montre que 25% des avis les plus longs représentent 50% du volume de texte avec un indice de gini de 0.39 qui indique une concentration modérée.

# Histogram of review length, annotated with the summary statistics stored
# earlier in `mean`, `median` and `max`.
length_label <- paste0("moy=", mean, " - médiane", median, " - max=", max)
g00a <- ggplot(comment_W, aes(x = nbcar)) +
  geom_histogram(binwidth = 50, fill = "brown") +
  theme_minimal() +
  xlim(0, 4000) +
  ylim(0, 3500) +
  annotate("text", x = 2000, y = 2900, size = 3, label = length_label) +
  labs(x = "nombre de caractéres", y = "Fréquence")

# Concentration of the text volume across reviews.
#library(ineq)
#library(gglorenz)
gini <- round(ineq(comment_W$nbcar, type = "Gini"), 2)

gini_label <- paste0("indice de Gini=", gini)
g00b <- ggplot(comment_W, aes(nbcar)) +
  stat_lorenz(desc = TRUE, size = 1.2, color = "darkred") +
  coord_fixed() +
  geom_abline(linetype = "dashed") +
  theme_minimal() +
  labs(
    x = "Part cumulée des commentaires",
    y = "Part cumulée du texte",
    title = "Concentration du volume de texte",
    caption = ""
  ) +
  annotate("text", x = .35, y = .6, size = 3, label = gini_label)
grid.arrange(g00a, g00b, ncol = 2)

La corrélation entre le nombre d’avis par hôtel et la taille moyenne des avis est faible <0.3 mais significative. Les grands hôtels ont aussi les commentaires les plus longs. On passe de 600 caractères à près de 800 pour les hôtels les plus commentés.

Le nombre d’avis par chambre, une manière de saisir l’intensité du flux d’avis généré dans un hôtel, n’est que faiblement corrélé à la taille de l’hôtel ; cette corrélation est néanmoins significative et négative (de l’ordre de -0,2) : elle montre que les petits hôtels sont plus productifs (est-ce qu’ils concentrent le plus d’avis négatifs ?).

# One unit per review, so that summing it counts the reviews of a hotel.
comment_W$nb_avis <- 1

# Per hotel: total characters and number of reviews, and mean room count.
# FUN is passed as a string on purpose: `sum` and `mean` are also the names
# of numeric variables created earlier in this script.
nb_comment_p_hotel <- aggregate(
  cbind(nbcar, nb_avis) ~ Hotel,
  data = comment_W,
  FUN = "sum"
)
nb_ch_p_hotel <- aggregate(
  chambres ~ Hotel,
  data = comment_W,
  FUN = "mean"
)

nb_comment_p_hotel <- merge(nb_comment_p_hotel, nb_ch_p_hotel)

# Mean review length (characters) per hotel.
nb_comment_p_hotel$nbcar_moy <- with(nb_comment_p_hotel, nbcar / nb_avis)

g00c <- ggplot(nb_comment_p_hotel, aes(x = nb_avis, y = nbcar_moy)) +
  geom_point() +
  theme_minimal() +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  geom_smooth(method = "loess", color = "orange2")

# Log-log regression of mean review length on the number of reviews.
m <- lm(log(nbcar_moy) ~ log(nb_avis), data = nb_comment_p_hotel)
summary(m)

## 
## Call:
## lm(formula = log(nbcar_moy) ~ log(nb_avis), data = nb_comment_p_hotel)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.56852 -0.15628 -0.01719  0.13249  0.68930 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.27428    0.05251 119.478  < 2e-16 ***
## log(nb_avis)  0.04160    0.01235   3.368 0.000928 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2295 on 178 degrees of freedom
## Multiple R-squared:  0.05991,    Adjusted R-squared:  0.05463 
## F-statistic: 11.34 on 1 and 178 DF,  p-value: 0.0009283

# Plotmath label with the fitted coefficients and R^2 of model `m`.
# coef(m)[i] is a *named* number and format() keeps that name; a named value
# substituted into the expression deparses as c(`(Intercept)` = "...") and
# garbles the label, so the names are stripped with unname() first.
# NOTE(review): `m` is the log-log fit, so a and b are on the log scale.
eq <- substitute(italic(y) == a + b %.% italic(nb_avis)*","~~italic(r)^2~"="~r2,
                list(a = format(unname(coef(m)[1]), digits = 4),
                     b = format(unname(coef(m)[2]), digits = 2),
                     r2 = format(summary(m)$r.squared, digits = 3)))

# One-row data frame giving the position of the label on the plot.
dftext <- data.frame(nb_avis = 20, nbcar_moy = 1500,
                     eq = as.character(as.expression(eq)))

g00d <- g00c + geom_text(aes(label = eq), data = dftext, parse = TRUE)
g00d

# Reviews per room against hotel size (number of rooms).
nb_comment_p_hotel$avis_p_ch <- with(nb_comment_p_hotel, nb_avis / chambres)

g00e <- ggplot(nb_comment_p_hotel, aes(x = chambres, y = avis_p_ch)) +
  geom_point() +
  theme_minimal() +
  scale_x_log10() +
  geom_smooth(method = "lm") +
  geom_smooth(method = "loess", color = "orange2") +
  scale_y_log10()

# Log-log regression of reviews per room on the number of rooms.
m <- lm(log(avis_p_ch) ~ log(chambres), data = nb_comment_p_hotel)
summary(m)

## 
## Call:
## lm(formula = log(avis_p_ch) ~ log(chambres), data = nb_comment_p_hotel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3189 -0.6143  0.1580  0.6444  2.7325 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.27903    0.15859  14.370   <2e-16 ***
## log(chambres) -0.16658    0.06671  -2.497   0.0134 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.016 on 178 degrees of freedom
## Multiple R-squared:  0.03384,    Adjusted R-squared:  0.02842 
## F-statistic: 6.235 on 1 and 178 DF,  p-value: 0.01343

# Plotmath label with the fitted coefficients and R^2 of model `m`
# (reviews per room regressed on the number of rooms).
# - The predictor shown in the label is `chambres`, the regressor of this
#   model, not `nb_avis`.
# - unname() drops the coefficient names kept by format(); otherwise the
#   substituted values deparse as c(`(Intercept)` = "...") in the label.
# NOTE(review): `m` is the log-log fit, so a and b are on the log scale.
eq <- substitute(italic(y) == a + b %.% italic(chambres)*","~~italic(r)^2~"="~r2,
                list(a = format(unname(coef(m)[1]), digits = 4),
                     b = format(unname(coef(m)[2]), digits = 2),
                     r2 = format(summary(m)$r.squared, digits = 3)))

# One-row data frame giving the position of the label on the plot.
dftext <- data.frame(avis_p_ch = 50, chambres = 100,
                     eq = as.character(as.expression(eq)))

g00f <- g00e + geom_text(aes(label = eq), data = dftext, parse = TRUE)
g00f

# Log-log regression of reviews per room on the number of rooms.
# Same formula and data as the model `m` fitted just before, stored under
# its own name.
density1<-lm(log(avis_p_ch)~log(chambres),data=nb_comment_p_hotel)
summary(density1)

## 
## Call:
## lm(formula = log(avis_p_ch) ~ log(chambres), data = nb_comment_p_hotel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3189 -0.6143  0.1580  0.6444  2.7325 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.27903    0.15859  14.370   <2e-16 ***
## log(chambres) -0.16658    0.06671  -2.497   0.0134 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.016 on 178 degrees of freedom
## Multiple R-squared:  0.03384,    Adjusted R-squared:  0.02842 
## F-statistic: 6.235 on 1 and 178 DF,  p-value: 0.01343

# Mean review length per hotel regressed on the log of the number of rooms.
density2<-lm(nbcar_moy~log(chambres),data=nb_comment_p_hotel)
summary(density2)

## 
## Call:
## lm(formula = nbcar_moy ~ log(chambres), data = nb_comment_p_hotel)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -305.73  -95.00  -11.58   60.78  528.88 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    563.637     23.491  23.993  < 2e-16 ***
## log(chambres)   38.996      9.881   3.946 0.000114 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 150.6 on 178 degrees of freedom
## Multiple R-squared:  0.08045,    Adjusted R-squared:  0.07529 
## F-statistic: 15.57 on 1 and 178 DF,  p-value: 0.0001141

les contributions des hôtels

# One unit per review, so that summing it counts the reviews.
comment_W$n <- 1

# Per hotel name: number of reviews, total characters and mean characters.
# FUN stays a string: `sum` and `mean` also name numeric variables here.
hotel01 <- aggregate(n ~ nom, comment_W, FUN = "sum")
hotel02 <- aggregate(nbcar ~ nom, comment_W, FUN = "sum")
hotel03 <- aggregate(nbcar ~ nom, data = comment_W, FUN = "mean")
names(hotel03)[names(hotel03) == "nbcar"] <- "nbcar_mean"

Hotela <- merge(hotel01, hotel02)
# Only hotels with more than 100 reviews are plotted.
Hotelb <- merge(Hotela, hotel03) %>% filter(n > 100)

# Layers shared by the two horizontal bar charts.
hotel_bars <- list(
  geom_bar(stat = "identity", fill = "coral3"),
  coord_flip(),
  theme_minimal()
)
small_axis_text <- theme(axis.text = element_text(size = 7))

g01a <- ggplot(Hotelb, aes(x = reorder(nom, n), y = n)) +
  hotel_bars +
  labs(x = "Hotels (par nombre d'avis)", y = "nombre d'avis",
       title = "", caption = "") +
  small_axis_text

g01b <- ggplot(Hotelb, aes(x = reorder(nom, n), y = nbcar)) +
  hotel_bars +
  labs(x = "Hotels (par nombre d'avis)", y = "nombre total de caractères",
       title = "", caption = "") +
  small_axis_text

g01a

g01b

Les “scripteurs”

Les auteurs d’avis, que nous dénommerons “scripteurs”, peuvent produire plus d’un avis, d’autant plus que, pour la destination étudiée, les séjours se déroulent souvent dans différents établissements. On s’attend à ce qu’en plus de commentaires uniques, nous ayons des “scripteurs” répéteurs. C’est bien ce que les données confirment. Les 26000 commentaires sont produits par 11000 comptes uniques : 5000 d’entre eux n’ont produit qu’un avis, 6000 en ayant produit au moins deux. Les scripteurs occasionnels ne représentent cependant que 3,5 millions de caractères, donc moins d’un quart de toute la production.

TripAdvisor agrège donc moins des commentaires épars et occasionnels qu’une production répétée de quelques milliers de touristes. Sachant que la population de touristes correspond à 60 000 francophones annuels (soit 240 000 sur les quatre années étudiées), les contributeurs représentent 6 000/240 000 = 2,5% de la population totale des touristes. On s’aperçoit que le nombre d’avis produits par compte est indépendant de la taille des avis.

# One unit per review, so that summing it counts the reviews of an account.
comment_W$nb_comment <- 1

# Per account: total characters written and number of reviews.
nb_comment_p_user <- aggregate(
  cbind(nbcar, nb_comment) ~ Username,
  data = comment_W,
  FUN = "sum"
)
g02a <- ggplot(nb_comment_p_user, aes(x = nb_comment)) +
  geom_histogram(fill = "Orange3", binwidth = 1) +
  theme_minimal()
dim(nb_comment_p_user)

## [1] 11680     3

# Total characters produced by the accounts sharing the same review count.
nb_comment_p_user2 <- aggregate(
  nbcar ~ nb_comment,
  nb_comment_p_user,
  FUN = "sum"
)
g02b <- ggplot(nb_comment_p_user2, aes(x = nb_comment, y = nbcar)) +
  geom_bar(stat = "identity", fill = "Orange2") +
  theme_minimal() +
  labs(x = "nombre de commentaires par scripteur")

grid.arrange(g02a, g02b, ncol = 2)

# Mean review length (characters) per account.
nb_comment_p_user$taille_avis <-
  nb_comment_p_user$nbcar / nb_comment_p_user$nb_comment

# Log mean length against the number of reviews, with a linear fit.
# The argument is `method`; the misspelt `methode` was silently ignored, so
# the default smoother was drawn instead of the intended "lm" line.
g02c <- ggplot(nb_comment_p_user,
               aes(x = nb_comment, y = log(taille_avis))) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "nombre de commentaires par scripteur")
g02c

# Attach the per-account totals to every review. Columns present in both
# tables get the .x (review) / .y (account) suffixes; the per-review counter
# nb_comment.x is dropped and the account's mean length is derived.
comment_W <- comment_W %>%
  left_join(nb_comment_p_user, by = "Username") %>%
  select(-nb_comment.x) %>%
  mutate(ncar_user = nbcar.y / nb_comment.y)

# Writer experience class from the account's number of reviews
# (right-closed classes with upper bounds 1, 3 and 8).
writer_labels <- c("1 avis", "2 ou 3 avis", "4 à 8 avis", "9 et plus avis")
writer_class <- cut(comment_W$nb_comment.y,
                    breaks = c(-Inf, 1, 3, 8, Inf),
                    labels = FALSE, include.lowest = TRUE)
writer_known <- !is.na(writer_class)
comment_W$redacteur[writer_known] <- writer_labels[writer_class[writer_known]]

Evolution de la production

On remarquera la saisonnalité, mais aussi une chute qui doit pouvoir être attribuée aux politiques de détection des faux avis. Une étude de ces initiatives doit être menée.

# Date-time version of the review date (a format that lubridate likes).
comment_W$Month_Yr <- as.POSIXct(comment_W$Date)

# POSIXct is measured in seconds, so a one-week bin is 604800 seconds.
one_week <- 7 * 24 * 60 * 60
g <- ggplot(comment_W, aes(Month_Yr)) +
  geom_freqpoly(binwidth = one_week) +
  labs(
    title = "Nombre d'avis par semaine",
    caption = "Tripadvisor polynésie",
    x = "temps",
    y = "nb d'avis"
  ) +
  theme_minimal()
g

Volume en unités et nombre de caractères

On commence par définir nos critères de segmentation

comment_W$chambres <- as.numeric(comment_W$chambres)
ggplot(comment_W, aes(chambres)) +
  geom_histogram(binwidth = 5) +
  theme_minimal()

# Write into `current` the label of the class each value of `x` falls in.
# Classes are right-closed, bounded above by `upper_bounds`; entries where
# `x` is missing are left untouched.
assign_class <- function(current, x, upper_bounds, labels) {
  class_id <- cut(x, breaks = c(-Inf, upper_bounds, Inf),
                  labels = FALSE, include.lowest = TRUE)
  known <- !is.na(class_id)
  current[known] <- labels[class_id[known]]
  current
}

# NOTE(review): the size labels do not match the bounds actually applied
# (8, 15, 50, 120) -- confirm which of the two is intended.
comment_W$Taille_hotel <- assign_class(
  comment_W$Taille_hotel, comment_W$chambres,
  upper_bounds = c(8, 15, 50, 120),
  labels = c(" 1-5 chambres", " 6-15 chambres", "15-50 chambres",
             "50-80 chambres", "80 et plus chambres")
)

comment_W$redacteur <- assign_class(
  comment_W$redacteur, comment_W$nb_comment.y,
  upper_bounds = c(1, 3, 8),
  labels = c("1 avis", "2 ou 3 avis", "4 à 8 avis", "9 et plus avis")
)

comment_W$prix_classe <- assign_class(
  comment_W$prix_classe, comment_W$prix,
  upper_bounds = c(9999, 14999, 24999, 39999),
  labels = c("<10 000", "<15 000", "<25 000", "<40 000", ">40 000")
)

# Frequency tables of the four segmentation variables.
library(AMR)
freq(comment_W, Taille_hotel)

freq(comment_W, prix_classe)

freq(comment_W, redacteur)

freq(comment_W, destination)

# Multiple correspondence analysis (2 dimensions kept) on the price class,
# size class and hotel category; incomplete rows are dropped.
library(FactoMineR)
AFCM <- na.omit(comment_W[c("prix_classe", "Taille_hotel", "categorie")])
MCA(AFCM, ncp = 2, graph = TRUE)

## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 6168 individuals, described by 3 variables
## *The results are available in the following objects:
## 
##    name              description                       
## 1  "$eig"            "eigenvalues"                     
## 2  "$var"            "results for the variables"       
## 3  "$var$coord"      "coord. of the categories"        
## 4  "$var$cos2"       "cos2 for the categories"         
## 5  "$var$contrib"    "contributions of the categories" 
## 6  "$var$v.test"     "v-test for the categories"       
## 7  "$ind"            "results for the individuals"     
## 8  "$ind$coord"      "coord. for the individuals"      
## 9  "$ind$cos2"       "cos2 for the individuals"        
## 10 "$ind$contrib"    "contributions of the individuals"
## 11 "$call"           "intermediate results"            
## 12 "$call$marge.col" "weights of columns"              
## 13 "$call$marge.li"  "weights of rows"

Notes et longueur des commentaires

Une relation spectaculaire se manifeste : les avis les moins bien notés sont plus longs de plus de 40% que les avis bien notés. L’insatisfaction rend logorrhéique.

# Distribution of the ratings.
g04b <- ggplot(comment_W, aes(x = Note)) +
  geom_histogram(fill = "Orange4", binwidth = 1) +
  theme_minimal() +
  labs(x = "Note", y = "Fréquence", title = "Distribution des notes")

# Median review length (characters) for each rating, in long format.
# FUN stays a string: `median` also names a numeric variable at this point.
comment_W$note_avis <- as.factor(comment_W$Note)
foo <- melt(
  aggregate(cbind(nbcar.x) ~ note_avis, data = comment_W, FUN = "median")
)

g04c <- ggplot(foo, aes(x = note_avis, y = value, group = variable)) +
  geom_line(aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  labs(
    x = "Note",
    y = "Longueur de l'avis (en carac.)",
    title = "Longueur médiane des avis",
    caption = ""
  ) +
  theme(legend.position = "none") +
  ylim(300, 800)
grid.arrange(g04b, g04c, ncol = 2)

On peut chercher à modéliser la taille des avis. On utilise un modèle à composantes d’erreur pour prendre en compte l’effet propre à chaque hôtel (effet aléatoire par hôtel).

# Review length against rating (log y axis), with linear and loess fits.
g04a <- ggplot(comment_W, aes(x = Note, y = nbcar.x)) +
  geom_point(position = "jitter") +
  theme_minimal() +
  geom_smooth(method = "lm") +
  geom_smooth(method = "loess") +
  scale_y_log10()
g04a

# Mixed models of review length with a random intercept per hotel:
# rating only, then price / writer experience / hotel size, then destination.
library(lme4)
reg00 <- lmer(
  nbcar.x ~ Note + (1 | Hotel),
  data = comment_W
)
reg01 <- lmer(
  nbcar.x ~ Note + log(prix) + nb_comment.y + Taille_hotel + (1 | Hotel),
  data = comment_W
)
reg02 <- lmer(
  nbcar.x ~ Note + log(prix) + nb_comment.y + Taille_hotel + destination +
    (1 | Hotel),
  data = comment_W
)

# Side-by-side text table of the three models.
library(stargazer)
stargazer(reg00, reg01, reg02, type = "text")

## 
## ======================================================================
##                                          Dependent variable:          
##                                 --------------------------------------
##                                                nbcar.x                
##                                     (1)          (2)          (3)     
## ----------------------------------------------------------------------
## Note                            -116.843***  -117.473***  -117.503*** 
##                                   (4.391)      (4.405)      (4.412)   
##                                                                       
## log(prix)                                     144.624***   139.381*** 
##                                                (15.298)     (16.847)  
##                                                                       
## nb_comment.y                                  -5.272***    -5.181***  
##                                                (1.453)      (1.455)   
##                                                                       
## Taille_hotel 6-15 chambres                     -31.670      -35.586   
##                                                (27.726)     (28.850)  
##                                                                       
## Taille_hotel15-50 chambres                    -91.388***   -84.928*** 
##                                                (28.197)     (30.250)  
##                                                                       
## Taille_hotel50-80 chambres                     -35.579      -51.024   
##                                                (34.555)     (38.600)  
##                                                                       
## Taille_hotel80 et plus chambres                -14.737      -13.611   
##                                                (48.857)     (55.297)  
##                                                                       
## destinationBora Bora                                         56.674   
##                                                             (80.375)  
##                                                                       
## destinationIles vent                                         9.596    
##                                                             (75.643)  
##                                                                       
## destinationMarquises                                         -7.630   
##                                                             (84.184)  
##                                                                       
## destinationMoorea                                            3.729    
##                                                             (77.456)  
##                                                                       
## destinationTahiti                                            17.975   
##                                                             (78.356)  
##                                                                       
## destinationTuamotu                                           5.582    
##                                                             (76.917)  
##                                                                       
## Constant                        1,156.411***   -238.797     -199.519  
##                                   (22.343)    (147.968)    (177.876)  
##                                                                       
## ----------------------------------------------------------------------
## Observations                       26,467       25,947       25,947   
## Log Likelihood                  -207,974.100 -203,636.200 -203,607.900
## Akaike Inf. Crit.               415,956.200  407,292.400  407,247.800 
## Bayesian Inf. Crit.             415,988.900  407,374.000  407,378.400 
## ======================================================================
## Note:                                      *p<0.1; **p<0.05; ***p<0.01

Lisibilité

La lisibilité est une vieille question L. A. Sherman found that the English sentence was getting shorter. In Elizabethan times, the average sentence was 50 words long. In his own time, it was 23 words long. Sherman’s work established that: Literature is a subject for statistical analysis, Shorter sentences and concrete terms help people to make sense of what is written. Over time, text becomes easier if it is more like speech.

Dans une étude de Marc Hug portant sur une centaine d’articles du Monde, le nombre moyen de syllabes (phonèmes) par mot est de 3,5 et la longueur moyenne des phrases de 19 mots. Ici les moyennes sont beaucoup plus faibles, avec 15 mots et 1,5 syllabe, indiquant un niveau de langage bien inférieur à celui du journal Le Monde. Reste à apprécier l’importance de l’écart.

In 1943, Rudolf Flesch published his PhD dissertation, Marks of a Readable Style, which included a readability formula to predict the difficulty of adult reading material.

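Flesch-Kincaid Readability Score (Flesch and Kincaid 1975) : $0.39 \times ASL + 11.8 \times (N_{sy} / N_{w}) - 15.59$, où $ASL$ est la longueur moyenne des phrases (en mots), $N_{sy}$ le nombre de syllabes et $N_{w}$ le nombre de mots. https://en.wikipedia.org/wiki/Readability https://rdrr.io/cran/quanteda/man/textstat_readability.html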

On le garde comme référence historique et on ajoute l’ARI et le Coleman-Liau Grade, qui donnent le niveau (d’âge) à partir duquel le contenu peut être lu.

#library(quanteda)
# Base corpus built from the review text; it is filtered later on.
Corpus <- corpus(comment_W, text_field = "Commetaire")

# Readability indices computed for every review.
read <- textstat_readability(
  Corpus,
  measure = c("ARI", "Flesch.Kincaid", "Coleman.Liau.ECP",
              "Coleman.Liau.grade", "meanSentenceLength",
              "meanWordSyllables"),
  remove_hyphens = TRUE,
  min_sentence_length = 0,
  max_sentence_length = 10000,
  intermediate = FALSE
)

# Append the indices to the reviews, column-wise.
comment_W <- cbind(comment_W, read)

# Layers shared by the four histograms.
grey_hist <- list(geom_histogram(fill = "grey"), theme_minimal())

g06a <- ggplot(data = comment_W, aes(x = Coleman.Liau.grade)) + grey_hist
g06b <- ggplot(data = comment_W, aes(x = ARI)) + grey_hist + xlim(-10, 50)
g06c <- ggplot(data = comment_W, aes(x = meanWordSyllables)) +
  grey_hist +
  xlim(0, 3)
g06d <- ggplot(data = comment_W, aes(x = meanSentenceLength)) +
  grey_hist +
  xlim(0, 50)
grid.arrange(g06a, g06b, g06c, g06d, nrow = 2) # from gridExtra

Si on examine les corrélations, on s’aperçoit 1) que la longueur des phrases détermine largement l’ARI et le FK ; 2) qu’en revanche le nombre de syllabes est mieux lié au CLG ; 3) que les mesures sont insensibles à la longueur des avis : courts ou longs, ils sont aussi lisibles. Par conséquent les commentaires négatifs, plus longs, ne sont pas moins lisibles que les avis laudateurs ; 4) que l’ARI et le FK sont redondants : on ne garde que l’ARI, qui semble plus dépendant de la complexité grammaticale (longueur des phrases) ; 5) que le CLG est assez lié aux deux autres mais garde une indépendance : il mesure plutôt la simplicité/complexité du vocabulaire.

Diversité lexicale

Le TTR est l’indice le plus simple mais il est sensible à la longueur du texte ; le CTTR corrige ce biais, de même que l’indice de Maas, plus moderne, et le C de Herdan, qui corrige en principe mieux (ref :).

#library(quanteda)
# Word tokens without punctuation, numbers, symbols, separators, Twitter
# markers, hyphens or URLs.
toks <- tokens(
  Corpus,
  what = "word",
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_twitter = TRUE,
  remove_hyphens = TRUE,
  remove_url = TRUE,
  ngrams = 1,
  skip = 0L,
  concatenator = "_"
)
# French stop words are removed; padding = TRUE leaves an empty slot where
# each removed token was.
toks <- tokens_remove(
  toks,
  stopwords("french"),
  valuetype = "fixed",
  padding = TRUE
)

#dfmlex<-dfm(toks, tolower = TRUE,stem=TRUE)
# Lexical diversity indices for every review.
lexdiv <- textstat_lexdiv(
  toks,
  measure = c("TTR", "C", "Maas"),
  MATTR_window = 20L
)

# Working table: the variables kept for the rest of the analysis, followed
# by the lexical diversity indices.
comment_X <- comment_W %>%
  select(
    Hotel, Username, Titre, Commetaire, Note, note_avis, prix, chambres,
    Taille_hotel, prix_classe, destination, redacteur, nb_comment.y,
    nbcar.x, ARI, Coleman.Liau.grade, meanSentenceLength, meanWordSyllables
  )

comment_X <- cbind(comment_X, lexdiv)

L’analyse des corrélations montre que 1) la longueur des avis est liée négativement au TTR et au Maas, ce qui correspond à la critique de ces indices, qui ne peuvent rendre comparables des textes de longueurs très différentes ; 2) le TTR est aussi fortement lié au nombre de syllabes par mot, ce qui est logique dans la mesure où les mots les plus longs sont aussi les plus rares (les plus fréquents et répétés sont les déterminants (ce) et les coordinations (où, ou, et)) ; 3) le C est lié au Maas, indépendant de la longueur du texte et sensible à la longueur des mots. Il semble être l’indicateur le plus approprié.

La lisibilité est lié positivement à la diversité lexicale. L’usage de terme précis contribue sans doute à rendre le texte plus clairs, un texte simple favorise l’usage de terme précis.

Sentiment

La distribution du sentiment : NRC

On utilise le package syuzhet et en particulier le dictionnaire “nrc” developpé et traduit par @mohammad_crowdsourcing_2013 ( Index Feel)

library(syuzhet) # sentiment analysis

# Settings: NRC dictionary, French version.
method <- "nrc"
lang <- "french"

# Text scored for each review: the title, a full stop, then the review body.
phrase <- paste0(comment_W$Titre, ". ", comment_W$Commetaire)

# One sentiment score per review.
my_text_values_french <- get_sentiment(
  phrase,
  method = method,
  language = lang
)

Le sentiment est globalement plutôt positif, même si une fraction des contributions présente des valeurs négatives. La variance est relativement élevée, ce qui est le signe d’une certaine sensibilité. Il se distribue plutôt normalement, au moins de manière symétrique.

# Add the sentiment score to the working table of reviews.
sent <- as.data.frame(my_text_values_french)
sent$sentiment <- as.numeric(sent$my_text_values_french)

comment_X <- cbind(comment_X, sent)

# Summary statistics shown on the plot.
s_mean <- round(mean(comment_X$sentiment), 2)
s_std <- round(sd(comment_X$sentiment), 2)

# Histogram coloured by the sign of the score.
# NOTE(review): `quintile` holds only two classes, (-15,0] and (0,25];
# scores outside that range get NA.
comment_X$quintile <- cut(comment_X$sentiment, breaks = c(-15, 0, 25))

# The annotation used to sit at x = 150, outside xlim(-15, 30), so it was
# dropped from the plot. It is now anchored inside the plotted range, at the
# top of the panel (y = Inf, pushed down with vjust). The label also gets a
# separator between the two statistics.
ggplot(comment_X, aes(x = sentiment)) +
  geom_histogram(binwidth = 1, aes(fill = quintile)) +
  theme_minimal() +
  xlim(-15, +30) +
  scale_fill_manual(values = Zissou) +
  ggplot2::annotate(
    "text", x = 20, y = Inf, vjust = 2,
    label = paste0("moyenne=", s_mean, " - ecart type=", s_std)
  )

Mais Un indicateur dépendant de la longueur du texte

En corrélant le nombre de caractères et le score primaire de sentiment, une corrélation nette apparaît : elle est de l’ordre de 0.56 et s’atténue quand la taille du texte dépasse les 700 caractères. Quand on corrèle au score de sentiment standardisé (on divise par 500, la médiane de la longueur des textes), c’est une relation inverse qui apparaît, même si elle est moins forte (r = 0.26) : plus le texte est long et plus il est neutre. Mais prudence : neutre en moyenne, pas forcément en qualité d’expression. On s’aperçoit aussi que la note est faiblement corrélée avec le sentiment. S’ils vont dans le même sens, c’est avec une grande variation. Soit les notes ne disent pas ce que les gens ressentent, soit la mesure du sentiment est imparfaite.

La transformation opérée montre une relation linéaire claire dans le segment [-10,10] ; au-delà, le score de sentiment devient invariant avec le grand nombre de mentions négatives qui résulte d’un commentaire long.

Evolution du sentiment

### les corrélats du sentiment

Les modèles montrent que les déterminants sont relatifs au produit (taille de l’hôtel, avec un avantage à ceux de taille intermédiaire) et à la longueur du texte. Ni le prix ni l’expérience du scripteur ne sont significatifs, et la prise en compte de la période et de la destination ne semble pas être importante.

## 
## ===================================================================
##                                         Dependent variable:        
##                                 -----------------------------------
##                                             sent_score             
##                                     (1)         (2)         (3)    
## -------------------------------------------------------------------
## Note                             1.498***    1.503***    1.509***  
##                                   (0.034)     (0.034)     (0.034)  
##                                                                    
## nbcar.x                          -0.002***   -0.002***   -0.002*** 
##                                  (0.00005)   (0.00005)   (0.00005) 
##                                                                    
## log(prix)                                      0.059      -0.011   
##                                               (0.123)     (0.132)  
##                                                                    
## nb_comment.y                                  -0.008      -0.009   
##                                               (0.011)     (0.011)  
##                                                                    
## Taille_hotel 6-15 chambres                     0.173       0.233   
##                                               (0.224)     (0.226)  
##                                                                    
## Taille_hotel15-50 chambres                   1.281***    1.309***  
##                                               (0.229)     (0.237)  
##                                                                    
## Taille_hotel50-80 chambres                    0.576**     0.634**  
##                                               (0.282)     (0.303)  
##                                                                    
## Taille_hotel80 et plus chambres               0.853**     1.048**  
##                                               (0.403)     (0.435)  
##                                                                    
## Year2016                                                  -0.109   
##                                                           (0.087)  
##                                                                    
## Year2017                                                  -0.144   
##                                                           (0.092)  
##                                                                    
## Year2018                                                  0.222**  
##                                                           (0.104)  
##                                                                    
## Year2019                                                   0.050   
##                                                           (0.130)  
##                                                                    
## destinationBora Bora                                       0.054   
##                                                           (0.622)  
##                                                                    
## destinationIles vent                                       0.018   
##                                                           (0.585)  
##                                                                    
## destinationMarquises                                       0.035   
##                                                           (0.652)  
##                                                                    
## destinationMoorea                                          0.153   
##                                                           (0.599)  
##                                                                    
## destinationTahiti                                         -0.343   
##                                                           (0.606)  
##                                                                    
## destinationTuamotu                                        -0.585   
##                                                           (0.595)  
##                                                                    
## Constant                         -0.859***    -1.716      -0.937   
##                                   (0.172)     (1.192)     (1.386)  
##                                                                    
## -------------------------------------------------------------------
## Observations                      26,467      25,947      25,947   
## Log Likelihood                  -78,735.830 -77,184.480 -77,177.450
## Akaike Inf. Crit.               157,481.700 154,390.900 154,396.900
## Bayesian Inf. Crit.             157,522.600 154,480.800 154,568.300
## ===================================================================
## Note:                                   *p<0.1; **p<0.05; ***p<0.01

Positivity et negativity - nrc

Le même outil fournit un autre système d’annotations qui compte les mentions d’éléments positifs ou négatifs, ainsi que d’émotions définies sur la base de l’inventaire de @plutchik_psychoevolutionary_1982. On utilise simplement la fonction get_nrc_sentiment, en précisant le dictionnaire adéquat. L’échelle comprend en fait deux éléments : les 8 émotions de base au sens de Plutchik, et deux indicateurs de polarité. L’opérationnalisation réalisée par @mohammad_crowdsourcing_2013 s’inscrit dans une tradition de la recherche en marketing ; se souvenir de @havlena_varieties_1986 et de @westbrook_dimensionality_1991.

# Run get_nrc_sentiment() on the review strings with the French dictionary;
# the `positive` and `negative` columns of the result are used further down.
emotions <- get_nrc_sentiment(phrase,language = "french")

On s’intéresse surtout aux mentions positives et négatives (les émotions, c’est pour plus tard). La mesure permet ainsi une dissymétrie des deux polarités : il y a le bien, le mal, le mal et le bien, mais aussi ce qui n’est ni mal ni bien. Les textes étant inégaux en taille, on va ramener l’indicateur de polarité au nombre de caractères (sur une base de 500 caractères) de chaque contribution. En effet l’algorithme compte les valences et leur intensité est proportionnelle à la longueur du texte, ce qui est clairement démontré par la seconde figure. À partir de ces deux mesures, 4 indicateurs peuvent être construits * Positivité : nombre de termes positifs pour 500 signes. * Négativité : nombre de termes négatifs pour 500 signes. * Valence : différence entre la positivité et la négativité. * Expressivité : somme de la positivité et de la négativité. Le dernier graphe nous apprend que les jugements plutôt positifs sont aussi les plus expressifs. La “froideur” des avis les plus négatifs reflète-t-elle une crainte de la désapprobation sociale ? C’est une piste de recherche à poursuivre : on pourrait s’attendre à ce que les avis les plus négatifs surgissent plus facilement si la densité des mentions négatives est plus importante, et observer une sorte d’autocorrélation.

# Keep the two polarity counts and append them to the review table
polarity <- emotions[, c("positive", "negative")]
comment_X <- cbind(comment_X, polarity)

# Total number of positive mentions over all reviews
sum(comment_X$positive)

## [1] 238331

# Total of the `negative` column over all reviews
sum(comment_X$negative)

## [1] 104722

# Side-by-side histograms of the raw positive and negative counts per review.
# Each panel is annotated with the sum of the plotted column.
G1 <- ggplot(comment_X, aes(x = positive)) +
  geom_histogram(binwidth = 1, fill = "darkred") +
  xlim(-1, 40) +
  ylim(0, 3000) +
  theme_minimal() +
  ggplot2::annotate(
    "text",
    x = 0, y = 350, size = 2.5,
    label = paste0("n=", sum(comment_X$positive))
  )

G2 <- ggplot(comment_X, aes(x = negative)) +
  geom_histogram(binwidth = 1, fill = "blue4") +
  xlim(-1, 40) +
  ylim(0, 3000) +
  theme_minimal() +
  ggplot2::annotate(
    "text",
    x = 0, y = 350, size = 2.5,
    label = paste0("n=", sum(comment_X$negative))
  )

grid.arrange(G1, G2, ncol = 2)

La relation entre le nombre de mentions et la taille du texte est évidente et de l’ordre de 0.75.

# Correlation between review length (characters) and each polarity count
rp <- round(cor(comment_X$nbcar.x, comment_X$positive), 2)
rn <- round(cor(comment_X$nbcar.x, comment_X$negative), 2)

# Scatter plots with a GAM smoother. The annotation shows the correlation
# coefficient, so it is labelled "r=" (it was mislabelled "n=").
G09a <- ggplot(comment_X, aes(x = nbcar.x, y = positive)) +
  geom_point(color = "grey") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  xlim(0, 5000) +
  theme_minimal() +
  ylim(0, 30) +
  ggplot2::annotate(
    "text",
    x = 0, y = 30, size = 2.5,
    label = paste0("r=", rp)
  )

G09b <- ggplot(comment_X, aes(x = nbcar.x, y = negative)) +
  geom_point(color = "grey") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  xlim(0, 5000) +
  theme_minimal() +
  ylim(0, 30) +
  ggplot2::annotate(
    "text",
    x = 0, y = 30, size = 2.5,
    label = paste0("r=", rn)
  )

grid.arrange(G09a, G09b, ncol = 2)

L’idée de transformer cet indicateur brut en le rapportant à un même nombre de caractères (500 caractères, soit environ un avis médian) est donc justifiée. On observe une symétrie de l’effet de la valence sur l’expressivité, même si la contribution de la négativité est plus forte. Négativité et positivité ne sont pas corrélées.

# Polarity counts rescaled to a 500-character text
comment_X[["positivity"]] <- (comment_X[["positive"]] * 500) / (comment_X[["nbcar.x"]])
comment_X[["negativity"]] <- (comment_X[["negative"]] * 500) / (comment_X[["nbcar.x"]])

# Valence is the difference of the two rates, expressivity their sum
comment_X[["valence"]] <- comment_X[["positivity"]] - comment_X[["negativity"]]
comment_X[["expressivity"]] <- comment_X[["positivity"]] + comment_X[["negativity"]]

# Grey scatter plot of the review table with a GAM smoother, for one x/y mapping
gam_scatter <- function(mapping) {
  ggplot(comment_X, mapping) +
    geom_point(color = "grey") +
    geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
    theme_minimal()
}

G11 <- gam_scatter(aes(x = valence, y = expressivity))
G12 <- gam_scatter(aes(x = negativity, y = positivity))
G13 <- gam_scatter(aes(x = negativity, y = expressivity))
G14 <- gam_scatter(aes(x = positivity, y = expressivity))
grid.arrange(G11, G12, G13, G14, ncol = 2)

On fait les comparaisons ci-dessous

Mesure de l’expérience

Le LIWC permet d’obtenir d’autres indicateurs du sentiment. Une partie des 80 indicateurs proposés est relative à des dimensions topicales, dont trois groupes vont retenir notre attention dans la mesure où ils décrivent une partie de l’expérience relatée dans les commentaires. * La sensorialité (voir, entendre, sentir) * L’orientation temporelle (passé, présent, futur) * Les émotions négatives (tristesse, colère, anxiété)

La procédure pour extraire ces notions est fort simple :

# quanteda.dictionaries is installed from GitHub, which requires devtools:
# devtools::install_github("kbenoit/quanteda.dictionaries")
library("quanteda.dictionaries")

# Load the French dictionary file in LIWC format
dict_liwc_french <- dictionary(
  file = "FrenchLIWCDictionary.dic",
  format = "LIWC"
)

# Score the body of each review against the dictionary
test <- liwcalike(comment_X$Commetaire, dictionary = dict_liwc_french)

# Append the resulting columns to the review table
comment_X <- cbind(comment_X, test)

Sentiment LIWC

# Negative rate against the `émonég` column, positive rate against `émopos`,
# each with a GAM smoother
G15a <- ggplot(comment_X, aes(x = negativity, y = émonég)) +
  geom_point(color = "grey") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  theme_minimal()

G15b <- ggplot(comment_X, aes(x = positivity, y = émopos)) +
  geom_point(color = "grey") +
  geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs")) +
  theme_minimal()

grid.arrange(G15a, G15b, ncol = 2)

Annotations

# Text to annotate: title and body of each review joined by ". "
text <- paste0(comment_X$Titre, ". ", comment_X$Commetaire)
cnlp_init_udpipe(model_name = "french")

# The annotation and its save step are commented out; the stored result is
# read back from disk instead.
# obj <- cnlp_annotate(text, as_strings = TRUE)
# saveRDS(obj, "obj2018.rds")
obj <- readRDS(file = "obj2018.rds")

# Bar charts of the most frequent lemmas by part of speech.
# NOTE(review): `Vocab` is not created in this chunk; presumably it is the
# token table extracted from `obj` - confirm.
#
# Clean-up compared with the previous version:
# - theme() tweaks placed before a second theme_minimal() were discarded by it
#   (a complete theme replaces earlier settings), so that dead code is removed;
# - `label = NULL` is not a theme element and is dropped;
# - the third panel is titled "Verbes" but filtered adverbs, which the second
#   panel already covers, so it now selects upos == "VERB".

# Nouns occurring more than 3000 times
Vocab1 <- subset(Vocab, upos == "NOUN")
Table <- with(Vocab1, table(lemma))
ling <- as.data.frame(Table) %>% filter(Freq > 3000)
g2 <- ggplot(ling, aes(x = reorder(lemma, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "brown1") +
  coord_flip() +
  theme_minimal() +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank()) +
  labs(title = "Noms communs", x = "Noms commun", y = "nombre d'avis")

# Adjectives and adverbs occurring more than 3000 times
Vocab2 <- subset(Vocab, upos == "ADJ" | upos == "ADV")
Table <- with(Vocab2, table(lemma))
ling <- as.data.frame(Table) %>% filter(Freq > 3000)
g3 <- ggplot(ling, aes(x = reorder(lemma, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "brown2") +
  coord_flip() +
  theme_minimal() +
  theme(axis.title.y = element_blank()) +
  labs(title = "Adjectif ", x = "Adverbe et adjectifs", y = "nombre d'avis")

# Verbs occurring more than 1000 times
Vocab3 <- subset(Vocab, upos == "VERB")
Table <- with(Vocab3, table(lemma))
ling <- as.data.frame(Table) %>% filter(Freq > 1000)
g4 <- ggplot(ling, aes(x = reorder(lemma, Freq), y = Freq)) +
  geom_bar(stat = "identity", fill = "brown4") +
  coord_flip() +
  theme_minimal() +
  theme(axis.title.y = element_blank(), axis.text = element_text(size = 9)) +
  labs(title = " Verbes", x = "verbes", y = "nombre d'avis")

grid.arrange(g2, g3, g4, ncol = 3)

LDA

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Raw term-frequency matrix restricted to adjectives, nouns and verbs
tf <- cnlp_get_token(obj) %>%
  filter(upos %in% c("ADJ", "NOUN", "VERB")) %>%
  cnlp_utils_tfidf(
    min_df = 0.05,
    max_df = 0.95,
    type = "tf",
    tf_weight = "raw"
  )

# 8-topic LDA model; the seed is set just before fitting
lda_model <- LDA$new(
  n_topics = 8,
  doc_topic_prior = 0.1,
  topic_word_prior = 0.01
)
set.seed(67)
doc_topic_distr <- lda_model$fit_transform(
  x = tf,
  n_iter = 1000,
  convergence_tol = 0.001,
  n_check_convergence = 25,
  progressbar = FALSE
)

## INFO [2019-10-09 19:31:01] iter 25 loglikelihood = -3240004.542
## INFO [2019-10-09 19:31:02] iter 50 loglikelihood = -3166531.752
## INFO [2019-10-09 19:31:03] iter 75 loglikelihood = -3128457.471
## INFO [2019-10-09 19:31:04] iter 100 loglikelihood = -3107996.870
## INFO [2019-10-09 19:31:05] iter 125 loglikelihood = -3093352.427
## INFO [2019-10-09 19:31:06] iter 150 loglikelihood = -3084199.739
## INFO [2019-10-09 19:31:07] iter 175 loglikelihood = -3077449.440
## INFO [2019-10-09 19:31:07] iter 200 loglikelihood = -3071756.293
## INFO [2019-10-09 19:31:08] iter 225 loglikelihood = -3068214.496
## INFO [2019-10-09 19:31:09] iter 250 loglikelihood = -3066153.144
## INFO [2019-10-09 19:31:09] early stopping at 250 iteration

# Describe each topic by its 15 top words, ranked with relevance parameter
# lambda = 0.30 (lambda = 1 would rank by plain probability)
lda_res <- as.data.frame(lda_model$get_top_words(n = 15, lambda = 0.30))
lda_res$rank <- as.numeric(row.names(lda_res))
lda_res <- melt(lda_res, id.vars = c("rank"))

# One column of words per topic, best-ranked word on top and drawn largest.
# Legends are suppressed with "none": passing FALSE to guides() is deprecated.
ggplot(lda_res, aes(x = variable, y = rank, group = value, label = value)) +
  scale_y_reverse() +
  geom_text(aes(color = variable, size = sqrt(26 - rank))) +
  theme_minimal() +
  scale_color_hue() +
  guides(color = "none", size = "none") +
  labs(x = "topics", y = "par ordre de pertinence")

#library(LDAvis)
lda_model$plot()

# Append the document-topic weights to the review table
topic <- as.data.frame(doc_topic_distr)
comment_X <- cbind(comment_X, topic)

# Copy each topic column (V1..V8) under a readable label: label = source column
topic_labels <- c(
  dithyrambe = "V1",
  Chaleurrelation = "V2",
  rapportqltepx = "V3",
  transit = "V4",
  interactionclient = "V5",
  motulife = "V6",
  cartepostale = "V7",
  potentialite = "V8"
)
for (topic_label in names(topic_labels)) {
  comment_X[[topic_label]] <- comment_X[[topic_labels[[topic_label]]]]
}

Corrélation des Indicateurs de sentiment

pour reprendre ceux obtenus par LIWC

#C,K, Maas,positivity,negativity, expressivity,sent_score
# Correlation matrix of all text indicators, drawn as a lower triangle
indicator_cols <- c(
  "meanSentenceLength", "meanWordSyllables", "nbcar.x", "WPS", "WC",
  "ARI", "Coleman.Liau.grade", "TTR", "C", "Maas",
  "sent_score", "positivity", "émopos", "negativity", "émonég",
  "verbepassé", "verbefutur", "anxiété", "colère", "tristesse",
  "entendre", "sentir", "voir"
)
M <- cor(comment_X[, indicator_cols])
library(corrplot)
corrplot(M, type = "lower")

# Same plot restricted to the sentiment indicators, in mixed display
sentiment_cols <- c(
  "sent_score", "positivity", "negativity", "émopos", "émonég",
  "anxiété", "colère", "tristesse"
)
M <- cor(comment_X[, sentiment_cols])
corrplot.mixed(M)

Comparaison par hôtels, Iles et scripteurs

Les instruments étant construits, on teste systématiquement les 5V avec les 6 variables clés * La destination * La période (année) * La taille de l’hôtel (nombre de chambres) * Le prix de référence * L’expérience du scripteur * La note donnée par le scripteur

Production verbale

# Number of sentences per document (SPD), computed as WC / WPS
comment_X$SPD <- comment_X$WC / comment_X$WPS

# Faceted line chart of the mean of nbcar.x, WC and SPD by one grouping column.
#   group_var: name of the grouping column of comment_X (character scalar)
#   title, xlab: plot title and x-axis label
# Returns the ggplot object.
# This replaces six copy-pasted chunks, in which `size = 2` was passed twice
# to geom_line() and `scale =` relied on partial matching of `scales`.
plot_production_by <- function(group_var, title, xlab) {
  means <- aggregate(
    reformulate(group_var, response = quote(cbind(nbcar.x, WC, SPD))),
    data = comment_X,
    FUN = "mean"
  )
  means <- melt(means)
  ggplot(means, aes(x = .data[[group_var]], y = value, group = variable)) +
    geom_line(size = 2, aes(color = variable)) +
    facet_grid(variable ~ ., scales = "free") +
    theme_minimal() +
    labs(title = title, x = xlab, y = "moyenne", caption = "") +
    theme(legend.position = "none") +
    scale_color_manual(values = wes_palette(n = 5, name = "Zissou1")) +
    theme(axis.text.x = element_text(angle = 45))
}

g15a <- plot_production_by(
  "Taille_hotel", "Production textuelle : l'effet de taille", "taille des hôtels"
)
g15b <- plot_production_by(
  "prix_classe", "Production textuelle : l'effet du prix", "Gamme de prix"
)
# The x label of the yearly plot was a copy-paste of "Gamme de prix"
g15c <- plot_production_by(
  "Year", "Production textuelle : effet du temps", "Année"
)
g15d <- plot_production_by(
  "destination", "Production textuelle : l'effet des Iles", "Iles"
)
g15e <- plot_production_by(
  "redacteur", "Production textuelle : l'effet de l'expérience critiques",
  "Nb d'avis écrits"
)
g15f <- plot_production_by(
  "note_avis", "Production textuelle : l'effet du jugement", "Note"
)

grid.arrange(g15a, g15b, ncol = 2)

grid.arrange(g15c, g15d, ncol = 2)

grid.arrange(g15e, g15f, ncol = 2)

# Type-3 ANOVA of review length in characters on the six comparison factors,
# rendered as a table with effect sizes and power
anova_divlex01 <- lm(
  nbcar.x ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(
    sstable,
    col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")
  ),
  values = c("Nombre de caractères")
)
myft

Nombre de caractères
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	242.842	0.000	0.097	1.000
destination	6.000	4.174	0.000	0.031	0.980
Year	4.000	45.568	0.000	0.084	1.000
Taille_hotel	4.000	8.995	0.000	0.037	0.999
prix_classe	4.000	55.898	0.000	0.093	1.000
redacteur	3.000	11.422	0.000	0.036	1.000
note_avis	4.000	175.277	0.000	0.164	1.000
Residuals	25921.000

# Type-3 ANOVA of the word count (WC) on the six comparison factors,
# rendered as a table with effect sizes and power
anova_divlex01 <- lm(
  WC ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(
    sstable,
    col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")
  ),
  values = c("nombre de mots")
)
myft

nombre de mots
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	244.174	0.000	0.097	1.000
destination	6.000	4.129	0.000	0.031	0.979
Year	4.000	46.682	0.000	0.085	1.000
Taille_hotel	4.000	8.581	0.000	0.036	0.999
prix_classe	4.000	53.239	0.000	0.091	1.000
redacteur	3.000	10.465	0.000	0.035	0.999
note_avis	4.000	192.924	0.000	0.173	1.000
Residuals	25921.000

# Type-3 ANOVA of SPD on the six comparison factors.
# The table header used to read "Expressivité", but the response here is SPD
# (sentences per review), not the expressivity indicator, so the header now
# names the modelled variable.
anova_divlex01 <- lm(
  SPD ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- flextable(
  sstable,
  col_keys = c("term", "statistic", "df", "p.value", "cohens.f", "power")
)
myft <- add_header_lines(myft, values = c("Nombre de phrases"))
myft

Expressivité
term	statistic	df	p.value	cohens.f	power
(Intercept)	177.670	1.000	0.000	0.083	1.000
destination	3.775	6.000	0.001	0.030	0.966
Year	64.533	4.000	0.000	0.100	1.000
Taille_hotel	5.247	4.000	0.000	0.028	0.971
prix_classe	40.835	4.000	0.000	0.079	1.000
redacteur	39.226	3.000	0.000	0.067	1.000
note_avis	51.307	4.000	0.000	0.089	1.000
Residuals		25921.000

# Mean of sent_score, nbcar.x, WC and SPD for each level of one grouping
# column; the grouping column is moved to the end and renamed `categorie`.
mean_by_category <- function(group_var) {
  means <- aggregate(
    reformulate(group_var, response = quote(cbind(sent_score, nbcar.x, WC, SPD))),
    data = comment_X,
    FUN = "mean"
  )
  means$categorie <- means[[group_var]]
  means[[group_var]] <- NULL
  means
}

foo1 <- mean_by_category("destination")
foo2 <- mean_by_category("Year")
foo3 <- mean_by_category("Taille_hotel")
foo4 <- mean_by_category("prix_classe")
foo5 <- mean_by_category("redacteur")
foo6 <- mean_by_category("note_avis")

# Stack the six summaries and show the three text-production columns
foo <- rbind(foo1, foo2, foo3, foo4, foo5, foo6)
myft01 <- flextable(foo, col_keys = c("categorie", "nbcar.x", "WC", "SPD"))
myft01

categorie	nbcar.x	WC	SPD
Australes	612.537	116.381	5.127
Bora Bora	829.102	159.254	7.666
Iles vent	610.908	116.588	5.350
Marquises	601.051	113.562	5.014
Moorea	752.109	143.980	6.891
Tahiti	655.988	126.440	5.913
Tuamotu	671.533	129.114	5.822
2015	731.162	139.590	6.603
2016	740.539	142.424	6.763
2017	716.858	137.809	6.511
2018	605.628	115.738	5.218
2019	572.714	109.362	4.925
1-5 chambres	631.840	121.482	5.573
6-15 chambres	605.122	115.424	5.229
15-50 chambres	601.004	115.427	5.362
50-80 chambres	785.367	150.698	7.254
80 et plus chambres	776.325	148.496	6.968
<10 000	597.703	115.096	5.287
<15 000	598.508	115.317	5.407
<25 000	641.079	123.584	5.660
<40 000	605.765	115.780	5.328
>40 000	826.350	158.137	7.551
1 avis	727.583	139.811	6.821
2 ou 3 avis	721.256	138.313	6.652
4 à 8 avis	655.316	125.765	5.706
9 et plus avis	700.138	133.048	5.302
1	1054.900	208.400	8.032
2	933.012	183.278	7.257
3	816.016	158.217	6.656
4	660.355	126.256	5.867
5	665.300	126.929	6.299

Comparaison des sentiments

on reprend les comparaisons par catégories :

# Faceted line chart of the mean of the five sentiment indicators
# (positivity, émopos, negativity, émonég, expressivity) by one grouping column.
#   group_var: name of the grouping column of comment_X (character scalar)
#   title, xlab: plot title and x-axis label
# Returns the ggplot object.
# This replaces six copy-pasted chunks, in which `size = 2` was passed twice
# to geom_line() and `scale =` relied on partial matching of `scales`.
plot_sentiment_by <- function(group_var, title, xlab) {
  means <- aggregate(
    reformulate(
      group_var,
      response = quote(cbind(positivity, émopos, negativity, émonég, expressivity))
    ),
    data = comment_X,
    FUN = "mean"
  )
  means <- melt(means)
  ggplot(means, aes(x = .data[[group_var]], y = value, group = variable)) +
    geom_line(size = 2, aes(color = variable)) +
    facet_grid(variable ~ ., scales = "free") +
    theme_minimal() +
    labs(title = title, x = xlab, y = "moyenne", caption = "") +
    theme(legend.position = "none") +
    scale_color_manual(values = wes_palette(n = 5, name = "Zissou1")) +
    theme(axis.text.x = element_text(angle = 45))
}

g15a <- plot_sentiment_by(
  "Taille_hotel", "Sentiment : l'effet de taille", "taille des hôtels"
)
g15b <- plot_sentiment_by(
  "prix_classe", "Sentiment : l'effet du prix", "Gamme de prix"
)
# The x label of the yearly plot was a copy-paste of "Gamme de prix"
g15c <- plot_sentiment_by(
  "Year", "Sentiment : effet du temps", "Année"
)
g15d <- plot_sentiment_by(
  "destination", "Sentiment : l'effet des Iles", "Iles"
)
g15e <- plot_sentiment_by(
  "redacteur", "Sentiment : l'effet de l'expérience critiques", "Nb d'avis écrits"
)
g15f <- plot_sentiment_by(
  "note_avis", "Sentiment : l'effet du jugement", "Note"
)

grid.arrange(g15a, g15b, ncol = 2)

grid.arrange(g15c, g15d, ncol = 2)

grid.arrange(g15e, g15f, ncol = 2)

#library(sjstats)
#library(kableExtra)
# Linear model of sent_score on the six design factors, summarised as a
# type-3 ANOVA table (car::Anova + anova_stats) and rendered as a flextable
# with a title row.
anova_divlex01 <- lm(
  sent_score ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "Sentiment général, FEEL"
)
myft

Sentiment général, FEEL
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	0.577	0.448	0.005	0.118
destination	6.000	8.779	0.000	0.045	1.000
Year	4.000	14.126	0.000	0.047	1.000
Taille_hotel	4.000	36.189	0.000	0.075	1.000
prix_classe	4.000	19.252	0.000	0.055	1.000
redacteur	3.000	0.458	0.712	0.007	0.143
note_avis	4.000	684.174	0.000	0.325	1.000
Residuals	25921.000

# Same model with positivity as the response: type-3 ANOVA table rendered
# as a titled flextable.
anova_divlex01 <- lm(
  positivity ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "Positivité FEEL"
)
myft

Positivité FEEL
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	99.428	0.000	0.062	1.000
destination	6.000	4.697	0.000	0.033	0.990
Year	4.000	31.428	0.000	0.070	1.000
Taille_hotel	4.000	42.369	0.000	0.081	1.000
prix_classe	4.000	23.088	0.000	0.060	1.000
redacteur	3.000	3.942	0.008	0.021	0.834
note_avis	4.000	388.989	0.000	0.245	1.000
Residuals	25921.000

# Type-3 ANOVA of émopos on the six design factors, rendered as a titled
# flextable. Note the column order here: statistic is shown before df.
anova_divlex01 <- lm(
  émopos ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "statistic", "df", "p.value", "cohens.f", "power")),
  values = "Emotions positives LIWC"
)
myft

Emotions positives LIWC
term	statistic	df	p.value	cohens.f	power
(Intercept)	117.248	1.000	0.000	0.067	1.000
destination	22.378	6.000	0.000	0.072	1.000
Year	14.829	4.000	0.000	0.048	1.000
Taille_hotel	7.300	4.000	0.000	0.034	0.996
prix_classe	24.363	4.000	0.000	0.061	1.000
redacteur	4.309	3.000	0.005	0.022	0.869
note_avis	679.522	4.000	0.000	0.324	1.000
Residuals		25921.000

# Type-3 ANOVA of negativity on the six design factors, rendered as a
# titled flextable.
anova_divlex01 <- lm(
  negativity ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "négativité"
)
myft

négativité
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	380.048	0.000	0.121	1.000
destination	6.000	29.667	0.000	0.083	1.000
Year	4.000	7.200	0.000	0.033	0.996
Taille_hotel	4.000	22.781	0.000	0.059	1.000
prix_classe	4.000	19.084	0.000	0.054	1.000
redacteur	3.000	21.403	0.000	0.050	1.000
note_avis	4.000	293.420	0.000	0.213	1.000
Residuals	25921.000

# Type-3 ANOVA of émonég on the six design factors, rendered as a titled
# flextable. Note the column order here: statistic is shown before df.
anova_divlex01 <- lm(
  émonég ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "statistic", "df", "p.value", "cohens.f", "power")),
  values = "Emotion négative"
)
myft

Emotion négative
term	statistic	df	p.value	cohens.f	power
(Intercept)	330.006	1.000	0.000	0.113	1.000
destination	5.112	6.000	0.000	0.034	0.995
Year	2.478	4.000	0.042	0.020	0.712
Taille_hotel	1.235	4.000	0.294	0.014	0.392
prix_classe	9.840	4.000	0.000	0.039	1.000
redacteur	3.413	3.000	0.017	0.020	0.772
note_avis	809.528	4.000	0.000	0.353	1.000
Residuals		25921.000

# Type-3 ANOVA of expressivity on the six design factors, rendered as a
# titled flextable. Note the column order here: statistic is shown before df.
anova_divlex01 <- lm(
  expressivity ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "statistic", "df", "p.value", "cohens.f", "power")),
  values = "Expressivité"
)
myft

Expressivité
term	statistic	df	p.value	cohens.f	power
(Intercept)	314.620	1.000	0.000	0.110	1.000
destination	12.201	6.000	0.000	0.053	1.000
Year	36.212	4.000	0.000	0.075	1.000
Taille_hotel	39.177	4.000	0.000	0.078	1.000
prix_classe	24.762	4.000	0.000	0.062	1.000
redacteur	14.920	3.000	0.000	0.042	1.000
note_avis	79.220	4.000	0.000	0.111	1.000
Residuals		25921.000

# Mean of each sentiment indicator for every level of the six design factors.
# In each per-factor table the grouping column is copied into `categorie`
# and then dropped, so the six tables share the same columns and can be
# stacked with rbind() before being shown as one flextable.
foo1 <- aggregate(
  cbind(sent_score, positivity, émopos, negativity, émonég, expressivity) ~ destination,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = destination) %>%
  dplyr::select(-destination)
foo2 <- aggregate(
  cbind(sent_score, positivity, émopos, negativity, émonég, expressivity) ~ Year,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = Year) %>%
  dplyr::select(-Year)
foo3 <- aggregate(
  cbind(sent_score, positivity, émopos, negativity, émonég, expressivity) ~ Taille_hotel,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = Taille_hotel) %>%
  dplyr::select(-Taille_hotel)
foo4 <- aggregate(
  cbind(sent_score, positivity, émopos, negativity, émonég, expressivity) ~ prix_classe,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = prix_classe) %>%
  dplyr::select(-prix_classe)
foo5 <- aggregate(
  cbind(sent_score, positivity, émopos, negativity, émonég, expressivity) ~ redacteur,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = redacteur) %>%
  dplyr::select(-redacteur)
foo6 <- aggregate(
  cbind(sent_score, positivity, émopos, negativity, émonég, expressivity) ~ note_avis,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = note_avis) %>%
  dplyr::select(-note_avis)

foo <- rbind(foo1, foo2, foo3, foo4, foo5, foo6)
myft01 <- flextable(
  foo,
  col_keys = c("categorie", "sent_score", "positivity", "émopos", "negativity", "émonég", "expressivity")
)
myft01

categorie	sent_score	positivity	émopos	negativity	émonég	expressivity
Australes	5.057	8.437	7.036	3.380	0.804	11.818
Bora Bora	4.851	7.397	6.664	2.546	1.058	9.942
Iles vent	5.392	8.311	7.303	2.919	0.928	11.231
Marquises	5.092	7.933	7.217	2.841	0.983	10.774
Moorea	5.010	7.819	6.936	2.809	1.074	10.628
Tahiti	3.935	7.614	6.073	3.679	1.189	11.293
Tuamotu	4.506	7.578	7.026	3.072	1.127	10.650
2015	4.730	7.699	6.657	2.969	1.078	10.668
2016	4.622	7.561	6.733	2.939	1.057	10.500
2017	4.629	7.650	6.718	3.020	1.070	10.670
2018	5.086	8.325	6.981	3.239	1.071	11.564
2019	4.988	8.263	7.088	3.275	1.161	11.538
1-5 chambres	4.397	7.568	6.866	3.170	0.954	10.738
6-15 chambres	4.954	8.177	7.129	3.223	0.962	11.400
15-50 chambres	5.165	8.474	6.938	3.309	1.144	11.783
50-80 chambres	4.708	7.577	6.640	2.869	1.137	10.446
80 et plus chambres	4.764	7.547	6.546	2.783	1.120	10.330
<10 000	4.027	7.830	6.589	3.803	0.922	11.632
<15 000	3.995	7.719	6.361	3.725	1.080	11.444
<25 000	4.450	7.589	6.823	3.139	1.065	10.728
<40 000	5.319	8.390	7.087	3.072	1.068	11.462
>40 000	4.944	7.559	6.792	2.615	1.091	10.174
1 avis	4.910	7.670	6.912	2.760	1.016	10.431
2 ou 3 avis	4.771	7.702	6.732	2.931	1.048	10.633
4 à 8 avis	4.639	7.904	6.785	3.265	1.127	11.169
9 et plus avis	4.630	8.017	6.708	3.388	1.135	11.405
1	-0.439	4.482	3.498	4.921	2.636	9.403
2	0.261	4.740	3.997	4.479	2.226	9.220
3	2.109	6.196	5.007	4.086	1.654	10.282
4	4.712	7.920	6.876	3.207	1.083	11.127
5	5.801	8.365	7.409	2.564	0.811	10.929

Comparaison de la lisibilité

# Readability plots: mean ARI and Coleman-Liau grade per level of each
# design factor, melted to long format and drawn as one facet per index
# (free scales, legend hidden, x tick labels rotated 45 degrees).
# Fixes applied in this block:
#   * `size` was passed twice to geom_line() and theme(legend.position) was
#     repeated on every plot;
#   * g07c (x = Year) was titled "l'effet de la note" with the reviewer
#     x label; g07f (x = note_avis) had the x label "destination";
#   * `scale=` relied on partial matching of facet_grid()'s `scales`.
foo <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ destination, data = comment_X, FUN = "mean")
foo <- melt(foo)
g07d <- ggplot(foo, aes(x = destination, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Lisibilité : l'effet de destination", x = "destination", y = "moyenne", caption = "")

foo <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ Year, data = comment_X, FUN = "mean")
foo <- melt(foo)
g07c <- ggplot(foo, aes(x = Year, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Lisibilité : l'effet du temps", x = "Année", y = "moyenne", caption = "")

foo <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ Taille_hotel, data = comment_X, FUN = "mean")
foo <- melt(foo)
g07a <- ggplot(foo, aes(x = Taille_hotel, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Lisibilité : l'effet de taille", x = "taille des hôtels", y = "moyenne", caption = "") +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45))

foo <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ prix_classe, data = comment_X, FUN = "mean")
foo <- melt(foo)
g07b <- ggplot(foo, aes(x = prix_classe, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Lisibilité : l'effet du prix", x = "Gamme de prix", y = "moyenne", caption = "")

foo <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ redacteur, data = comment_X, FUN = "mean")
foo <- melt(foo)
g07e <- ggplot(foo, aes(x = redacteur, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Lisibilité : l'effet du scripteur", x = "Nombre d'avis par scripteur", y = "moyenne", caption = "")

# NOTE(review): only Coleman.Liau.grade is aggregated here, whereas the five
# sibling plots also include ARI -- confirm whether the omission is intended.
foo <- aggregate(cbind(Coleman.Liau.grade) ~ note_avis, data = comment_X, FUN = "mean")
foo <- melt(foo)
g07f <- ggplot(foo, aes(x = note_avis, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Lisibilité : l'effet de notation", x = "Note", y = "moyenne", caption = "")
g07f

# Show the six readability panels two by two.
grid.arrange(g07a, g07b, ncol = 2)

grid.arrange(g07c, g07d, ncol = 2)

grid.arrange(g07e, g07f, ncol = 2)

# Mean ARI and Coleman-Liau grade for every level of the six design factors.
# Each per-factor table copies its grouping column into `categorie` and
# drops the original so that all six can be stacked with rbind().
foo1 <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ destination, data = comment_X, FUN = "mean") %>%
  mutate(categorie = destination) %>%
  dplyr::select(-destination)
foo2 <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ Year, data = comment_X, FUN = "mean") %>%
  mutate(categorie = Year) %>%
  dplyr::select(-Year)
foo3 <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ Taille_hotel, data = comment_X, FUN = "mean") %>%
  mutate(categorie = Taille_hotel) %>%
  dplyr::select(-Taille_hotel)
foo4 <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ prix_classe, data = comment_X, FUN = "mean") %>%
  mutate(categorie = prix_classe) %>%
  dplyr::select(-prix_classe)
foo5 <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ redacteur, data = comment_X, FUN = "mean") %>%
  mutate(categorie = redacteur) %>%
  dplyr::select(-redacteur)
foo6 <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ note_avis, data = comment_X, FUN = "mean") %>%
  mutate(categorie = note_avis) %>%
  dplyr::select(-note_avis)

foo <- rbind(foo1, foo2, foo3, foo4, foo5, foo6)
myft02 <- flextable(foo, col_keys = c("categorie", "ARI", "Coleman.Liau.grade"))
myft02

categorie	ARI	Coleman.Liau.grade
Australes	14.379	11.776
Bora Bora	12.834	10.915
Iles vent	14.040	11.471
Marquises	15.182	11.889
Moorea	12.924	11.047
Tahiti	13.024	10.794
Tuamotu	14.192	11.310
2015	13.250	11.197
2016	12.766	10.883
2017	13.301	11.008
2018	14.278	11.427
2019	14.443	11.493
1-5 chambres	13.683	11.175
6-15 chambres	14.308	11.522
15-50 chambres	13.994	11.218
50-80 chambres	12.734	10.923
80 et plus chambres	12.802	10.951
<10 000	13.873	10.942
<15 000	13.301	11.016
<25 000	13.752	11.113
<40 000	14.153	11.300
>40 000	12.738	11.045
1 avis	12.353	10.919
2 ou 3 avis	12.623	10.998
4 à 8 avis	14.245	11.224
9 et plus avis	16.749	11.741
1	15.009	10.500
2	15.178	10.696
3	14.433	10.748
4	13.658	11.147
5	12.783	11.199

# Type-3 ANOVA of the Coleman-Liau grade on the six design factors, rendered
# as a titled flextable.
anova_01 <- lm(
  Coleman.Liau.grade ~ Taille_hotel + prix_classe + destination + Year + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "Coleman liau grade"
)
myft

Coleman liau grade
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	1766.867	0.000	0.261	1.000
Taille_hotel	4.000	12.647	0.000	0.044	1.000
prix_classe	4.000	7.210	0.000	0.033	0.996
destination	6.000	15.533	0.000	0.060	1.000
Year	4.000	44.373	0.000	0.083	1.000
redacteur	3.000	46.436	0.000	0.073	1.000
note_avis	4.000	26.847	0.000	0.064	1.000
Residuals	25921.000

# Type-3 ANOVA of ARI on the six design factors, rendered as a titled
# flextable.
anova_01 <- lm(
  ARI ~ Taille_hotel + prix_classe + destination + Year + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "ARI"
)
myft

ARI
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	200.329	0.000	0.088	1.000
Taille_hotel	4.000	3.640	0.006	0.024	0.881
prix_classe	4.000	6.658	0.000	0.032	0.993
destination	6.000	4.636	0.000	0.033	0.989
Year	4.000	11.609	0.000	0.042	1.000
redacteur	3.000	86.057	0.000	0.100	1.000
note_avis	4.000	31.469	0.000	0.070	1.000
Residuals	25921.000

## Diversité lexicale

# Lexical diversity plots: mean TTR, C and Maas per level of each design
# factor, melted to long format and drawn as one facet per index (free
# scales, legend hidden, x tick labels rotated 45 degrees).
# Fixes applied in this block:
#   * g08a title read "Diversité comment_Xicale" (search/replace artifact);
#   * every plot carried the x label "destination", whatever its x variable;
#   * g08e and g08f were titled "Lisibilité" although they show lexical
#     diversity;
#   * g08g mapped x to `destination`, a column absent from data aggregated by
#     note_avis, so printing it would have failed;
#   * `size` was passed twice to geom_line() and `scale=` relied on partial
#     matching of facet_grid()'s `scales`.
foo <- aggregate(cbind(TTR, C, Maas) ~ Taille_hotel, data = comment_X, FUN = "mean")
foo <- melt(foo)
g08a <- ggplot(foo, aes(x = Taille_hotel, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Diversité lexicale : l'effet de taille", x = "taille des hôtels", y = "moyenne", caption = "")

foo <- aggregate(cbind(TTR, C, Maas) ~ prix_classe, data = comment_X, FUN = "mean")
foo <- melt(foo)
g08b <- ggplot(foo, aes(x = prix_classe, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Diversité lexicale : l'effet de prix", x = "Gamme de prix", y = "moyenne", caption = "")

foo <- aggregate(cbind(TTR, C, Maas) ~ destination, data = comment_X, FUN = "mean")
foo <- melt(foo)
g08c <- ggplot(foo, aes(x = destination, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Diversité lexicale : l'effet de destination", x = "destination", y = "moyenne", caption = "")

foo <- aggregate(cbind(TTR, C, Maas) ~ Year, data = comment_X, FUN = "mean")
foo <- melt(foo)
g08d <- ggplot(foo, aes(x = Year, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Diversité lexicale : l'effet du temps", x = "Année", y = "moyenne", caption = "")

foo <- aggregate(cbind(TTR, C, Maas) ~ redacteur, data = comment_X, FUN = "mean")
foo <- melt(foo)
g08e <- ggplot(foo, aes(x = redacteur, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Diversité lexicale : l'effet de scripteur", x = "Nombre d'avis par scripteur", y = "moyenne", caption = "")

foo <- aggregate(cbind(TTR, C, Maas) ~ note_avis, data = comment_X, FUN = "mean")
foo <- melt(foo)
g08f <- ggplot(foo, aes(x = note_avis, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Diversité lexicale : l'effet de notation", x = "Note", y = "moyenne", caption = "")

# Show the six lexical diversity panels two by two.
grid.arrange(g08a, g08b, ncol = 2)

grid.arrange(g08c, g08d, ncol = 2)

grid.arrange(g08e, g08f, ncol = 2)

# g08g is built from the per-rating means but is never displayed in this
# section; its x mapping now matches the aggregated column.
foo <- aggregate(cbind(TTR, C, Maas) ~ note_avis, data = comment_X, FUN = "mean")
foo <- melt(foo)
g08g <- ggplot(foo, aes(x = note_avis, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45)) +
  labs(title = "Diversité lexicale : l'effet de notation", x = "Note", y = "moyenne", caption = "")

# Mean TTR, C and Maas for every level of the six design factors. Each
# per-factor table copies its grouping column into `categorie` and drops the
# original so that all six can be stacked with rbind(). Only C and Maas are
# kept as columns of the displayed flextable.
foo1 <- aggregate(cbind(TTR, C, Maas) ~ destination, data = comment_X, FUN = "mean") %>%
  mutate(categorie = destination) %>%
  dplyr::select(-destination)
foo2 <- aggregate(cbind(TTR, C, Maas) ~ Year, data = comment_X, FUN = "mean") %>%
  mutate(categorie = Year) %>%
  dplyr::select(-Year)
foo3 <- aggregate(cbind(TTR, C, Maas) ~ Taille_hotel, data = comment_X, FUN = "mean") %>%
  mutate(categorie = Taille_hotel) %>%
  dplyr::select(-Taille_hotel)
foo4 <- aggregate(cbind(TTR, C, Maas) ~ prix_classe, data = comment_X, FUN = "mean") %>%
  mutate(categorie = prix_classe) %>%
  dplyr::select(-prix_classe)
foo5 <- aggregate(cbind(TTR, C, Maas) ~ redacteur, data = comment_X, FUN = "mean") %>%
  mutate(categorie = redacteur) %>%
  dplyr::select(-redacteur)
foo6 <- aggregate(cbind(TTR, C, Maas) ~ note_avis, data = comment_X, FUN = "mean") %>%
  mutate(categorie = note_avis) %>%
  dplyr::select(-note_avis)

foo <- rbind(foo1, foo2, foo3, foo4, foo5, foo6)
myft03 <- flextable(foo, col_keys = c("categorie", "C", "Maas"))
myft03

categorie	C	Maas
Australes	0.870	0.260
Bora Bora	0.860	0.266
Iles vent	0.866	0.266
Marquises	0.866	0.266
Moorea	0.860	0.267
Tahiti	0.862	0.268
Tuamotu	0.865	0.264
2015	0.862	0.265
2016	0.861	0.267
2017	0.862	0.267
2018	0.866	0.266
2019	0.865	0.269
1-5 chambres	0.865	0.266
6-15 chambres	0.866	0.266
15-50 chambres	0.866	0.266
50-80 chambres	0.860	0.267
80 et plus chambres	0.858	0.268
<10 000	0.864	0.269
<15 000	0.864	0.267
<25 000	0.864	0.266
<40 000	0.865	0.267
>40 000	0.859	0.266
1 avis	0.860	0.269
2 ou 3 avis	0.860	0.268
4 à 8 avis	0.865	0.264
9 et plus avis	0.869	0.261
1	0.862	0.258
2	0.862	0.258
3	0.863	0.261
4	0.864	0.266
5	0.862	0.269

# Type-3 ANOVA of the C lexical diversity index on the six design factors,
# rendered as a titled flextable.
# Fix: the model used the lower-case `year` column, which entered with a
# single degree of freedom, whereas every other model and summary table in
# this analysis uses `Year` (4 df). `Year` is used here so the tables are
# comparable.
anova_divlex02 <- lm(
  C ~ Taille_hotel + prix_classe + destination + Year + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex02, type = 3))
myft <- flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power"))
myft <- add_header_lines(myft, values = c("C"))
myft

C
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	11.638	0.001	0.021	0.927
Taille_hotel	4.000	32.021	0.000	0.070	1.000
prix_classe	4.000	5.804	0.000	0.030	0.983
destination	6.000	6.354	0.000	0.038	0.999
year	1.000	49.031	0.000	0.043	1.000
redacteur	3.000	109.863	0.000	0.113	1.000
note_avis	4.000	6.192	0.000	0.031	0.989
Residuals	25924.000

# Type-3 ANOVA of the Maas lexical diversity index on the six design factors,
# rendered as a titled flextable.
# Fix: the model used the lower-case `year` column, which entered with a
# single degree of freedom, whereas every other model and summary table in
# this analysis uses `Year` (4 df). `Year` is used here so the tables are
# comparable.
anova_divlex01 <- lm(
  Maas ~ Taille_hotel + prix_classe + destination + Year + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power"))
myft <- add_header_lines(myft, values = c("Maas"))
myft

Maas
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	22.498	0.000	0.029	0.997
Taille_hotel	4.000	13.883	0.000	0.046	1.000
prix_classe	4.000	30.577	0.000	0.069	1.000
destination	6.000	7.036	0.000	0.040	1.000
year	1.000	30.865	0.000	0.035	1.000
redacteur	3.000	49.214	0.000	0.075	1.000
note_avis	4.000	93.286	0.000	0.120	1.000
Residuals	25924.000

On s’intéresse d’abord à l’expérience sensorielle

# Per-destination means of three variable groups, each melted to long format
# and drawn as a flipped line chart: sentir/voir/entendre (g20a), the three
# verb tense columns (g20b) and anxiété/colère/tristesse (g20c). The three
# charts are stacked in three rows.
foo1 <- melt(aggregate(cbind(sentir, voir, entendre) ~ destination, data = comment_X, FUN = "mean"))
g20a <- ggplot(foo1, aes(x = destination, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  coord_flip()

foo2 <- melt(aggregate(cbind(verbepassé, verbefutur, verbeprésent) ~ destination, data = comment_X, FUN = "mean"))
g20b <- ggplot(foo2, aes(x = destination, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  labs(y = "densité", caption = "") +
  coord_flip()

foo <- melt(aggregate(cbind(anxiété, colère, tristesse) ~ destination, data = comment_X, FUN = "mean"))
g20c <- ggplot(foo, aes(x = destination, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  coord_flip()

grid.arrange(g20a, g20b, g20c, nrow = 3)

# Per-year means of sentir/voir/entendre (g21a), the three verb tense columns
# (g21b) and anxiété/colère/tristesse (g21c), each melted to long format and
# drawn as a flipped line chart; the three charts are stacked in three rows.
foo1 <- melt(aggregate(cbind(sentir, voir, entendre) ~ Year, data = comment_X, FUN = "mean"))
g21a <- ggplot(foo1, aes(x = Year, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  coord_flip()

foo2 <- melt(aggregate(cbind(verbepassé, verbefutur, verbeprésent) ~ Year, data = comment_X, FUN = "mean"))
g21b <- ggplot(foo2, aes(x = Year, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  labs(y = "densité", caption = "") +
  coord_flip()

foo <- melt(aggregate(cbind(anxiété, colère, tristesse) ~ Year, data = comment_X, FUN = "mean"))
g21c <- ggplot(foo, aes(x = Year, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  coord_flip()

grid.arrange(g21a, g21b, g21c, nrow = 3)

# Per hotel-size class means of sentir/voir/entendre (g22a), the three verb
# tense columns (g22b) and anxiété/colère/tristesse (g22c), each melted to
# long format and drawn as a flipped line chart; stacked in three rows.
foo1 <- melt(aggregate(cbind(sentir, voir, entendre) ~ Taille_hotel, data = comment_X, FUN = "mean"))
g22a <- ggplot(foo1, aes(x = Taille_hotel, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  coord_flip()

foo2 <- melt(aggregate(cbind(verbepassé, verbefutur, verbeprésent) ~ Taille_hotel, data = comment_X, FUN = "mean"))
g22b <- ggplot(foo2, aes(x = Taille_hotel, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  labs(y = "densité", caption = "") +
  coord_flip()

foo <- melt(aggregate(cbind(anxiété, colère, tristesse) ~ Taille_hotel, data = comment_X, FUN = "mean"))
g22c <- ggplot(foo, aes(x = Taille_hotel, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  coord_flip()

grid.arrange(g22a, g22b, g22c, nrow = 3)

# Per price class means of sentir/voir/entendre (g23a), the three verb tense
# columns (g23b) and anxiété/colère/tristesse (g23c), each melted to long
# format and drawn as a flipped line chart; stacked in three rows.
foo1 <- melt(aggregate(cbind(sentir, voir, entendre) ~ prix_classe, data = comment_X, FUN = "mean"))
g23a <- ggplot(foo1, aes(x = prix_classe, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  coord_flip()

foo2 <- melt(aggregate(cbind(verbepassé, verbefutur, verbeprésent) ~ prix_classe, data = comment_X, FUN = "mean"))
g23b <- ggplot(foo2, aes(x = prix_classe, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  labs(y = "densité", caption = "") +
  coord_flip()

foo <- melt(aggregate(cbind(anxiété, colère, tristesse) ~ prix_classe, data = comment_X, FUN = "mean"))
g23c <- ggplot(foo, aes(x = prix_classe, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  coord_flip()

grid.arrange(g23a, g23b, g23c, nrow = 3)

# Per reviewer class (redacteur) means of sentir/voir/entendre (g24a), the
# three verb tense columns (g24b) and anxiété/colère/tristesse (g24c), each
# melted to long format and drawn as a flipped line chart; stacked in three
# rows.
foo1 <- melt(aggregate(cbind(sentir, voir, entendre) ~ redacteur, data = comment_X, FUN = "mean"))
g24a <- ggplot(foo1, aes(x = redacteur, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  coord_flip()

foo2 <- melt(aggregate(cbind(verbepassé, verbefutur, verbeprésent) ~ redacteur, data = comment_X, FUN = "mean"))
g24b <- ggplot(foo2, aes(x = redacteur, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  labs(y = "densité", caption = "") +
  coord_flip()

foo <- melt(aggregate(cbind(anxiété, colère, tristesse) ~ redacteur, data = comment_X, FUN = "mean"))
g24c <- ggplot(foo, aes(x = redacteur, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  coord_flip()

grid.arrange(g24a, g24b, g24c, nrow = 3)

# Per rating (note_avis) means of sentir/voir/entendre (g25a), the three verb
# tense columns (g25b) and anxiété/colère/tristesse (g25c), each melted to
# long format and drawn as a flipped line chart; stacked in three rows.
foo1 <- melt(aggregate(cbind(sentir, voir, entendre) ~ note_avis, data = comment_X, FUN = "mean"))
g25a <- ggplot(foo1, aes(x = note_avis, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  coord_flip()

foo2 <- melt(aggregate(cbind(verbepassé, verbefutur, verbeprésent) ~ note_avis, data = comment_X, FUN = "mean"))
g25b <- ggplot(foo2, aes(x = note_avis, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  labs(y = "densité", caption = "") +
  coord_flip()

foo <- melt(aggregate(cbind(anxiété, colère, tristesse) ~ note_avis, data = comment_X, FUN = "mean"))
g25c <- ggplot(foo, aes(x = note_avis, y = value, group = variable)) +
  geom_line(mapping = aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  coord_flip()

grid.arrange(g25a, g25b, g25c, nrow = 3)

Analyses de variance

# Type-3 ANOVA of sentir on the six design factors, rendered as a titled
# flextable.
anova_divlex01 <- lm(
  sentir ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "sentir"
)
myft

sentir
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	51.528	0.000	0.045	1.000
destination	6.000	10.214	0.000	0.049	1.000
Year	4.000	1.911	0.106	0.017	0.582
Taille_hotel	4.000	5.662	0.000	0.030	0.981
prix_classe	4.000	5.410	0.000	0.029	0.976
redacteur	3.000	1.051	0.369	0.011	0.287
note_avis	4.000	27.207	0.000	0.065	1.000
Residuals	25921.000

# Type-3 ANOVA of voir on the six design factors, rendered as a titled
# flextable.
anova_divlex01 <- lm(
  voir ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "voir"
)
myft

voir
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	5.660	0.017	0.015	0.662
destination	6.000	18.088	0.000	0.065	1.000
Year	4.000	10.070	0.000	0.039	1.000
Taille_hotel	4.000	24.955	0.000	0.062	1.000
prix_classe	4.000	51.286	0.000	0.089	1.000
redacteur	3.000	22.955	0.000	0.052	1.000
note_avis	4.000	42.837	0.000	0.081	1.000
Residuals	25921.000

# Type-3 ANOVA of entendre on the six design factors, rendered as a titled
# flextable.
anova_divlex01 <- lm(
  entendre ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "entendre"
)
myft

entendre
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	32.106	0.000	0.035	1.000
destination	6.000	8.605	0.000	0.045	1.000
Year	4.000	2.795	0.025	0.021	0.770
Taille_hotel	4.000	2.939	0.019	0.021	0.793
prix_classe	4.000	4.470	0.001	0.026	0.942
redacteur	3.000	1.835	0.138	0.015	0.480
note_avis	4.000	8.306	0.000	0.036	0.999
Residuals	25921.000

# Type-3 ANOVA of verbepassé on the six design factors, rendered as a titled
# flextable.
anova_divlex01 <- lm(
  verbepassé ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "verbepassé"
)
myft

verbepassé
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	313.562	0.000	0.110	1.000
destination	6.000	1.207	0.299	0.017	0.484
Year	4.000	83.561	0.000	0.114	1.000
Taille_hotel	4.000	53.629	0.000	0.091	1.000
prix_classe	4.000	29.815	0.000	0.068	1.000
redacteur	3.000	155.970	0.000	0.134	1.000
note_avis	4.000	66.966	0.000	0.102	1.000
Residuals	25921.000

# Type-3 ANOVA of verbefutur on the six design factors, rendered as a titled
# flextable.
anova_divlex01 <- lm(
  verbefutur ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "verbefutur"
)
myft

verbefutur
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	26.094	0.000	0.032	0.999
destination	6.000	1.083	0.370	0.016	0.436
Year	4.000	0.170	0.954	0.005	0.086
Taille_hotel	4.000	1.227	0.297	0.014	0.389
prix_classe	4.000	6.716	0.000	0.032	0.993
redacteur	3.000	12.861	0.000	0.039	1.000
note_avis	4.000	10.270	0.000	0.040	1.000
Residuals	25921.000

# Type-3 ANOVA of verbeprésent on the six design factors, rendered as a
# titled flextable.
anova_divlex01 <- lm(
  verbeprésent ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "verbeprésent"
)
myft

verbeprésent
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	415.837	0.000	0.127	1.000
destination	6.000	4.801	0.000	0.033	0.992
Year	4.000	24.052	0.000	0.061	1.000
Taille_hotel	4.000	2.737	0.027	0.021	0.760
prix_classe	4.000	6.202	0.000	0.031	0.989
redacteur	3.000	9.432	0.000	0.033	0.997
note_avis	4.000	31.955	0.000	0.070	1.000
Residuals	25921.000

# Type-3 ANOVA of anxiété on the six design factors, rendered as a titled
# flextable.
anova_divlex01 <- lm(
  anxiété ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "Anxiété"
)
myft

Anxiété
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	70.095	0.000	0.052	1.000
destination	6.000	0.988	0.431	0.015	0.398
Year	4.000	1.891	0.109	0.017	0.577
Taille_hotel	4.000	1.807	0.124	0.017	0.555
prix_classe	4.000	2.468	0.043	0.020	0.710
redacteur	3.000	0.611	0.608	0.008	0.179
note_avis	4.000	65.365	0.000	0.100	1.000
Residuals	25921.000

# Type-3 ANOVA of colère on the six design factors, rendered as a titled
# flextable.
anova_divlex01 <- lm(
  colère ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "colère"
)
myft

colère
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	38.579	0.000	0.039	1.000
destination	6.000	2.996	0.006	0.026	0.910
Year	4.000	3.160	0.013	0.022	0.825
Taille_hotel	4.000	0.706	0.588	0.010	0.231
prix_classe	4.000	0.956	0.430	0.012	0.307
redacteur	3.000	2.078	0.101	0.016	0.535
note_avis	4.000	66.259	0.000	0.101	1.000
Residuals	25921.000

# Type-3 ANOVA of tristesse on the six design factors, rendered as a titled
# flextable.
anova_divlex01 <- lm(
  tristesse ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(car::Anova(anova_divlex01, type = 3))
myft <- add_header_lines(
  flextable(sstable, col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")),
  values = "tristesse"
)
myft

tristesse
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	68.992	0.000	0.052	1.000
destination	6.000	13.514	0.000	0.056	1.000
Year	4.000	3.688	0.005	0.024	0.885
Taille_hotel	4.000	2.448	0.044	0.019	0.706
prix_classe	4.000	17.173	0.000	0.051	1.000
redacteur	3.000	18.198	0.000	0.046	1.000
note_avis	4.000	312.116	0.000	0.219	1.000
Residuals	25921.000

# Mean of the nine semantic marker columns for every level of the six design
# factors. Each per-factor table copies its grouping column into `categorie`
# and drops the original so that all six can be stacked with rbind() and
# shown as a single flextable.
foo1 <- aggregate(
  cbind(sentir, voir, entendre, verbepassé, verbefutur, verbeprésent, anxiété, colère, tristesse) ~ destination,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = destination) %>%
  dplyr::select(-destination)
foo2 <- aggregate(
  cbind(sentir, voir, entendre, verbepassé, verbefutur, verbeprésent, anxiété, colère, tristesse) ~ Year,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = Year) %>%
  dplyr::select(-Year)
foo3 <- aggregate(
  cbind(sentir, voir, entendre, verbepassé, verbefutur, verbeprésent, anxiété, colère, tristesse) ~ Taille_hotel,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = Taille_hotel) %>%
  dplyr::select(-Taille_hotel)
foo4 <- aggregate(
  cbind(sentir, voir, entendre, verbepassé, verbefutur, verbeprésent, anxiété, colère, tristesse) ~ prix_classe,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = prix_classe) %>%
  dplyr::select(-prix_classe)
foo5 <- aggregate(
  cbind(sentir, voir, entendre, verbepassé, verbefutur, verbeprésent, anxiété, colère, tristesse) ~ redacteur,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = redacteur) %>%
  dplyr::select(-redacteur)
foo6 <- aggregate(
  cbind(sentir, voir, entendre, verbepassé, verbefutur, verbeprésent, anxiété, colère, tristesse) ~ note_avis,
  data = comment_X, FUN = "mean"
) %>%
  mutate(categorie = note_avis) %>%
  dplyr::select(-note_avis)

foo <- rbind(foo1, foo2, foo3, foo4, foo5, foo6)
myft04 <- flextable(
  foo,
  col_keys = c("categorie", "sentir", "voir", "entendre", "verbepassé", "verbefutur", "verbeprésent", "anxiété", "colère", "tristesse")
)
myft04

categorie	sentir	voir	entendre	verbepassé	verbefutur	verbeprésent	anxiété	colère	tristesse
Australes	0.392	0.955	0.293	3.351	0.183	4.703	0.099	0.122	0.293
Bora Bora	0.356	1.294	0.385	4.449	0.283	5.487	0.100	0.171	0.541
Iles vent	0.357	1.213	0.397	3.168	0.258	5.080	0.117	0.174	0.435
Marquises	0.366	1.690	0.363	3.226	0.243	4.947	0.098	0.170	0.405
Moorea	0.356	1.314	0.469	4.138	0.277	5.392	0.111	0.152	0.547
Tahiti	0.453	1.215	0.503	3.870	0.243	5.399	0.119	0.142	0.459
Tuamotu	0.434	1.233	0.426	3.314	0.267	5.226	0.129	0.175	0.527
2015	0.370	1.259	0.455	3.980	0.262	5.349	0.118	0.137	0.506
2016	0.397	1.274	0.443	4.078	0.268	5.456	0.111	0.159	0.481
2017	0.398	1.206	0.421	3.943	0.264	5.371	0.112	0.165	0.485
2018	0.382	1.282	0.454	3.151	0.255	5.046	0.115	0.170	0.512
2019	0.387	1.400	0.462	3.075	0.264	4.987	0.137	0.176	0.547
1-5 chambres	0.413	1.059	0.376	3.203	0.270	5.290	0.129	0.176	0.406
6-15 chambres	0.358	1.181	0.458	3.216	0.231	5.119	0.107	0.161	0.447
15-50 chambres	0.437	1.343	0.432	3.047	0.264	5.251	0.127	0.165	0.510
50-80 chambres	0.365	1.307	0.451	4.525	0.268	5.447	0.109	0.153	0.544
80 et plus chambres	0.382	1.475	0.510	4.408	0.271	5.281	0.098	0.139	0.549
<10 000	0.467	0.859	0.486	3.155	0.255	5.460	0.123	0.139	0.372
<15 000	0.427	0.962	0.394	3.279	0.248	5.422	0.132	0.165	0.404
<25 000	0.445	1.215	0.421	3.214	0.284	5.272	0.137	0.177	0.476
<40 000	0.373	1.481	0.477	3.446	0.224	5.096	0.108	0.153	0.489
>40 000	0.352	1.331	0.453	4.544	0.284	5.392	0.100	0.154	0.560
1 avis	0.385	1.216	0.408	4.081	0.302	5.367	0.120	0.179	0.449
2 ou 3 avis	0.377	1.200	0.442	4.145	0.275	5.387	0.118	0.152	0.461
4 à 8 avis	0.403	1.347	0.455	3.365	0.235	5.254	0.110	0.160	0.554
9 et plus avis	0.410	1.410	0.488	2.998	0.225	4.979	0.103	0.150	0.567
1	0.544	0.825	0.566	4.544	0.368	5.929	0.358	0.376	0.876
2	0.529	0.974	0.512	4.325	0.296	5.698	0.181	0.265	0.903
3	0.475	1.067	0.473	4.093	0.281	5.649	0.119	0.193	0.783
4	0.398	1.335	0.459	3.614	0.239	5.269	0.105	0.143	0.527
5	0.352	1.305	0.418	3.781	0.266	5.222	0.105	0.146	0.380

Tableau de synthèse des topics

# Stacked bar charts of the mean topic weights, one chart per explanatory
# factor. The six copy-pasted chunks are factored into one helper.
topic_cols <- c(
  "dithyrambe", "Chaleurrelation", "rapportqltepx", "transit",
  "interactionclient", "motulife", "cartepostale", "potentialite"
)

# Build the stacked bar chart of mean topic weights by `group_var`.
#   group_var: name (string) of the grouping column in `data`.
#   data:      data frame holding the topic columns and the grouping column.
# Returns a ggplot object.
plot_topic_means <- function(group_var, data = comment_X) {
  fml <- as.formula(
    paste0("cbind(", paste(topic_cols, collapse = ", "), ") ~ ", group_var)
  )
  means <- aggregate(fml, data = data, FUN = "mean")
  # Explicit id.vars: no reliance on melt() guessing the id column from types.
  long <- melt(means, id.vars = group_var)
  ggplot(long, aes(x = .data[[group_var]], y = value, group = variable)) +
    geom_bar(stat = "identity", aes(fill = variable)) +
    labs(x = group_var) +
    theme_minimal()
}

# The destination chart used to be stored in g17e and was then overwritten by
# the Year chart; it now has its own name, g17e still ends up holding the
# Year chart as before.
g17f <- plot_topic_means("destination")
g17f

g17e <- plot_topic_means("Year")
g17e

g17a <- plot_topic_means("Taille_hotel")
g17a

g17b <- plot_topic_means("prix_classe")
g17b

g17c <- plot_topic_means("redacteur")
g17c

g17d <- plot_topic_means("note_avis")
g17d

# Type III ANOVA of the "dithyrambe" topic weight on the six explanatory
# factors; the table keeps F statistic, p-value, Cohen's f and power.
anova_divlex01 <- lm(
  dithyrambe ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(
  car::Anova(anova_divlex01, type = 3)
)
myft <- sstable %>%
  flextable(col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")) %>%
  add_header_lines(values = c("dithyrambe"))
myft

dithyrambe
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	4.670	0.031	0.013	0.580
destination	6.000	16.348	0.000	0.062	1.000
Year	4.000	87.545	0.000	0.116	1.000
Taille_hotel	4.000	47.774	0.000	0.086	1.000
prix_classe	4.000	52.138	0.000	0.090	1.000
redacteur	3.000	213.818	0.000	0.157	1.000
note_avis	4.000	199.195	0.000	0.175	1.000
Residuals	25921.000

# Type III ANOVA of the "Chaleurrelation" topic weight on the six explanatory
# factors; the table keeps F statistic, p-value, Cohen's f and power.
anova_divlex01 <- lm(
  Chaleurrelation ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(
  car::Anova(anova_divlex01, type = 3)
)
myft <- sstable %>%
  flextable(col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")) %>%
  add_header_lines(values = c("Chaleurrelation"))
myft

Chaleurrelation
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	59.639	0.000	0.048	1.000
destination	6.000	50.764	0.000	0.108	1.000
Year	4.000	32.041	0.000	0.070	1.000
Taille_hotel	4.000	276.893	0.000	0.207	1.000
prix_classe	4.000	7.633	0.000	0.034	0.998
redacteur	3.000	9.214	0.000	0.033	0.997
note_avis	4.000	289.169	0.000	0.211	1.000
Residuals	25921.000

# Type III ANOVA of the "rapportqltepx" topic weight on the six explanatory
# factors; the table keeps F statistic, p-value, Cohen's f and power.
anova_divlex01 <- lm(
  rapportqltepx ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(
  car::Anova(anova_divlex01, type = 3)
)
myft <- sstable %>%
  flextable(col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")) %>%
  add_header_lines(values = c("rapportqltepx"))
myft

rapportqltepx
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	0.727	0.394	0.005	0.137
destination	6.000	46.347	0.000	0.104	1.000
Year	4.000	16.998	0.000	0.051	1.000
Taille_hotel	4.000	47.657	0.000	0.086	1.000
prix_classe	4.000	44.868	0.000	0.083	1.000
redacteur	3.000	70.426	0.000	0.090	1.000
note_avis	4.000	110.628	0.000	0.131	1.000
Residuals	25921.000

# Type III ANOVA of the "transit" topic weight on the six explanatory
# factors; the table keeps F statistic, p-value, Cohen's f and power.
anova_divlex01 <- lm(
  transit ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(
  car::Anova(anova_divlex01, type = 3)
)
myft <- sstable %>%
  flextable(col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")) %>%
  add_header_lines(values = c("transit"))
myft

transit
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	24.451	0.000	0.031	0.999
destination	6.000	487.928	0.000	0.336	1.000
Year	4.000	28.024	0.000	0.066	1.000
Taille_hotel	4.000	92.391	0.000	0.119	1.000
prix_classe	4.000	133.395	0.000	0.143	1.000
redacteur	3.000	34.735	0.000	0.063	1.000
note_avis	4.000	203.820	0.000	0.177	1.000
Residuals	25921.000

# Type III ANOVA of the "interactionclient" topic weight on the six
# explanatory factors; the table keeps F statistic, p-value, Cohen's f and power.
anova_divlex01 <- lm(
  interactionclient ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(
  car::Anova(anova_divlex01, type = 3)
)
myft <- sstable %>%
  flextable(col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")) %>%
  add_header_lines(values = c("interactionclient"))
myft

interactionclient
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	321.287	0.000	0.111	1.000
destination	6.000	11.011	0.000	0.050	1.000
Year	4.000	4.179	0.002	0.025	0.924
Taille_hotel	4.000	7.953	0.000	0.035	0.998
prix_classe	4.000	41.598	0.000	0.080	1.000
redacteur	3.000	2.242	0.081	0.016	0.571
note_avis	4.000	673.378	0.000	0.322	1.000
Residuals	25921.000

# Type III ANOVA of the "motulife" topic weight on the six explanatory
# factors; the table keeps F statistic, p-value, Cohen's f and power.
anova_divlex01 <- lm(
  motulife ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(
  car::Anova(anova_divlex01, type = 3)
)
myft <- sstable %>%
  flextable(col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")) %>%
  add_header_lines(values = c("motulife"))
myft

motulife
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	205.573	0.000	0.089	1.000
destination	6.000	123.084	0.000	0.169	1.000
Year	4.000	3.672	0.005	0.024	0.884
Taille_hotel	4.000	319.760	0.000	0.222	1.000
prix_classe	4.000	98.237	0.000	0.123	1.000
redacteur	3.000	43.088	0.000	0.071	1.000
note_avis	4.000	33.431	0.000	0.072	1.000
Residuals	25921.000

# Type III ANOVA of the "cartepostale" topic weight on the six explanatory
# factors; the table keeps F statistic, p-value, Cohen's f and power.
anova_divlex01 <- lm(
  cartepostale ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(
  car::Anova(anova_divlex01, type = 3)
)
myft <- sstable %>%
  flextable(col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")) %>%
  add_header_lines(values = c("cartepostale"))
myft

cartepostale
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	20.693	0.000	0.028	0.995
destination	6.000	29.854	0.000	0.083	1.000
Year	4.000	60.539	0.000	0.097	1.000
Taille_hotel	4.000	210.638	0.000	0.180	1.000
prix_classe	4.000	57.041	0.000	0.094	1.000
redacteur	3.000	112.042	0.000	0.114	1.000
note_avis	4.000	70.706	0.000	0.104	1.000
Residuals	25921.000

# Type III ANOVA of the "potentialite" topic weight on the six explanatory
# factors; the table keeps F statistic, p-value, Cohen's f and power.
anova_divlex01 <- lm(
  potentialite ~ destination + Year + Taille_hotel + prix_classe + redacteur + note_avis,
  data = comment_X
)
sstable <- anova_stats(
  car::Anova(anova_divlex01, type = 3)
)
myft <- sstable %>%
  flextable(col_keys = c("term", "df", "statistic", "p.value", "cohens.f", "power")) %>%
  add_header_lines(values = c("potentialite"))
myft

potentialite
term	df	statistic	p.value	cohens.f	power
(Intercept)	1.000	7.957	0.005	0.018	0.805
destination	6.000	17.997	0.000	0.065	1.000
Year	4.000	49.223	0.000	0.087	1.000
Taille_hotel	4.000	27.704	0.000	0.065	1.000
prix_classe	4.000	22.652	0.000	0.059	1.000
redacteur	3.000	36.055	0.000	0.065	1.000
note_avis	4.000	38.089	0.000	0.077	1.000
Residuals	25921.000

# Summary table: mean weight of every topic for each level of each
# explanatory factor, stacked into a single data frame.
# The six copy-pasted aggregate() calls are replaced by one loop over the
# grouping variables, so the topic list is written only once.
topic_cols <- c(
  "dithyrambe", "Chaleurrelation", "rapportqltepx", "transit",
  "interactionclient", "motulife", "cartepostale", "potentialite"
)
group_vars <- c(
  "destination", "Year", "Taille_hotel",
  "prix_classe", "redacteur", "note_avis"
)

foo <- do.call(rbind, lapply(group_vars, function(group_var) {
  # cbind(<topics>) ~ <group_var>, same formula shape as the original calls
  fml <- as.formula(
    paste0("cbind(", paste(topic_cols, collapse = ", "), ") ~ ", group_var)
  )
  means <- aggregate(fml, data = comment_X, FUN = "mean")
  # Move the grouping column to a common "categorie" column so the six
  # pieces share the same layout and can be row-bound.
  means$categorie <- means[[group_var]]
  means[[group_var]] <- NULL
  means
}))

myft05 <- flextable(foo, col_keys = c("categorie", topic_cols))
myft05

categorie	dithyrambe	Chaleurrelation	rapportqltepx	transit	interactionclient	motulife	cartepostale	potentialite
Australes	0.044	0.367	0.054	0.024	0.076	0.349	0.038	0.048
Bora Bora	0.248	0.093	0.087	0.062	0.119	0.075	0.171	0.144
Iles vent	0.078	0.290	0.093	0.050	0.088	0.209	0.138	0.054
Marquises	0.069	0.268	0.115	0.080	0.100	0.165	0.149	0.053
Moorea	0.182	0.100	0.112	0.062	0.099	0.083	0.206	0.157
Tahiti	0.078	0.064	0.203	0.316	0.074	0.055	0.118	0.092
Tuamotu	0.083	0.234	0.081	0.043	0.083	0.272	0.122	0.080
2015	0.162	0.126	0.125	0.122	0.091	0.121	0.138	0.116
2016	0.152	0.129	0.114	0.133	0.085	0.112	0.146	0.128
2017	0.143	0.141	0.117	0.125	0.092	0.121	0.146	0.115
2018	0.064	0.202	0.141	0.085	0.096	0.166	0.185	0.062
2019	0.070	0.205	0.129	0.089	0.109	0.145	0.189	0.062
1-5 chambres	0.058	0.306	0.086	0.070	0.083	0.300	0.047	0.050
6-15 chambres	0.083	0.260	0.126	0.079	0.087	0.193	0.121	0.051
15-50 chambres	0.084	0.106	0.161	0.153	0.082	0.106	0.226	0.082
50-80 chambres	0.212	0.063	0.127	0.122	0.103	0.035	0.184	0.154
80 et plus chambres	0.147	0.057	0.129	0.182	0.093	0.028	0.202	0.163
<10 000	0.041	0.204	0.192	0.155	0.069	0.262	0.032	0.046
<15 000	0.046	0.214	0.143	0.218	0.071	0.222	0.044	0.041
<25 000	0.075	0.196	0.119	0.074	0.086	0.227	0.139	0.084
<40 000	0.081	0.163	0.158	0.149	0.080	0.093	0.193	0.081
>40 000	0.226	0.087	0.091	0.076	0.111	0.045	0.198	0.166
1 avis	0.199	0.178	0.081	0.087	0.102	0.125	0.116	0.112
2 ou 3 avis	0.154	0.149	0.108	0.131	0.092	0.106	0.137	0.123
4 à 8 avis	0.078	0.134	0.155	0.122	0.086	0.145	0.187	0.092
9 et plus avis	0.056	0.148	0.170	0.105	0.082	0.170	0.207	0.062
1	0.044	0.041	0.070	0.228	0.343	0.139	0.052	0.080
2	0.044	0.041	0.094	0.223	0.250	0.128	0.084	0.137
3	0.053	0.039	0.163	0.259	0.132	0.120	0.109	0.125
4	0.087	0.100	0.181	0.144	0.068	0.135	0.172	0.113
5	0.180	0.209	0.087	0.065	0.075	0.123	0.162	0.099

Qu’est-ce qui fait une bonne note ?

# Mean topic weights per rating level, drawn as stacked bars.
foo <- melt(
  aggregate(
    cbind(
      dithyrambe, Chaleurrelation, rapportqltepx, transit,
      interactionclient, motulife, cartepostale, potentialite
    ) ~ note_avis,
    data = comment_X,
    FUN = "mean"
  )
)
gN1 <- ggplot(foo, aes(x = note_avis, y = value, group = variable)) +
  geom_bar(stat = "identity", aes(fill = variable)) +
  theme_minimal()
gN1

# Three line charts of marker means by rating (axes flipped), stacked in one
# column: sensory verbs, verb tenses, negative emotions.

# Sensory markers
foo1 <- melt(
  aggregate(cbind(sentir, voir, entendre) ~ note_avis, data = comment_X, FUN = "mean")
)
gN2 <- ggplot(foo1, aes(x = note_avis, y = value, group = variable)) +
  geom_line(aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  coord_flip()

# Verb tenses
foo2 <- melt(
  aggregate(cbind(verbepassé, verbefutur, verbeprésent) ~ note_avis, data = comment_X, FUN = "mean")
)
gN3 <- ggplot(foo2, aes(x = note_avis, y = value, group = variable)) +
  geom_line(aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  labs(y = "densité", caption = "") +
  coord_flip()

# Negative emotions (a 4-colour palette is requested for the 3 series drawn)
foo3 <- melt(
  aggregate(cbind(anxiété, colère, tristesse) ~ note_avis, data = comment_X, FUN = "mean")
)
gN4 <- ggplot(foo3, aes(x = note_avis, y = value, group = variable)) +
  geom_line(aes(color = variable), size = 2) +
  theme_minimal() +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  coord_flip()

grid.arrange(gN2, gN3, gN4, nrow = 3)

# Lexical diversity (TTR and C) averaged by rating, one free-scaled facet per
# indicator.
# Fixes: the title and x label said "destination" although the x variable is
# note_avis; `scale =` relied on partial argument matching for `scales`.
# Note: this reuses the name gN4, replacing the negative-emotion chart built
# just before.
foo4 <- aggregate(cbind(TTR, C) ~ note_avis, data = comment_X, FUN = "mean")
foo4 <- melt(foo4)
gN4 <- ggplot(foo4, aes(x = note_avis, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45)
  ) +
  scale_color_manual(values = wes_palette(n = 3, name = "Zissou1")) +
  labs(
    title = "Diversité lexicale : l'effet de la note",
    x = "Note",
    y = "moyenne",
    caption = ""
  )
gN4

# Readability (ARI and Coleman-Liau grade) averaged by rating, one free-scaled
# facet per indicator.
# Fixes: the x label said "destination" although the x variable is note_avis;
# the duplicated theme(legend.position) call is dropped; `scale =` relied on
# partial argument matching for `scales`.
foo5 <- aggregate(cbind(ARI, Coleman.Liau.grade) ~ note_avis, data = comment_X, FUN = "mean")
foo5 <- melt(foo5)
gN5 <- ggplot(foo5, aes(x = note_avis, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 45)
  ) +
  scale_color_manual(values = wes_palette(n = 4, name = "Zissou1")) +
  labs(
    title = "Lisibilité : l'effet de notation",
    x = "Note",
    y = "moyenne",
    caption = ""
  )
gN5

# Sentiment indicators (positivity, negativity, expressivity) averaged by
# rating, one free-scaled facet per indicator, legend hidden.
foo6 <- melt(
  aggregate(
    cbind(positivity, negativity, expressivity) ~ note_avis,
    data = comment_X,
    FUN = "mean"
  )
)
gN6 <- ggplot(foo6, aes(x = note_avis, y = value, group = variable)) +
  geom_line(size = 2, aes(color = variable)) +
  facet_grid(variable ~ ., scales = "free") +
  theme_minimal() +
  labs(
    title = "Sentiment : l'effet du jugement",
    x = "Note",
    y = "moyenne",
    caption = ""
  ) +
  theme(legend.position = "none") +
  scale_color_manual(values = wes_palette(n = 5, name = "Zissou1")) +
  theme(axis.text.x = element_text(angle = 45))
gN6

Le dernier graphique fait apparaître des profils de notes bien distincts : ce qui distingue les 5* des autres, c’est le caractère chaleureux de la relation établie avec le personnel et une expérience qui est traduite de manière dithyrambique et enthousiaste.

Testons l’hypothèse avec un logit ordonné, puis avec un logit binaire. On s’appuie sur http://larmarange.github.io/analyse-R/regression-logistique.html#regression-logistique-ordinale

La table de confusion nous donne 69 % d’avis bien classés.

# Ordinal logit: the rating is recoded as an ordered factor (levels "1".."5")
# and regressed on the eight topic weights with a cumulative link model.
comment_X$note_avis <- factor(
  comment_X$note_avis,
  levels = as.character(1:5),
  ordered = TRUE
)
#freq(comment_X$note_avis )

library(ordinal)
library(broom)
library(effects)
library(caret) 

# Location part: the eight topics, no intercept; scale part: rapportqltepx and
# cartepostale.
# NOTE(review): the topic weights appear to sum to 1 per review (their group
# means do in the summary table), which would make the eight of them collinear
# with the thresholds -- confirm before interpreting the standard errors.
rego <- clm(
  note_avis ~ -1 + dithyrambe + Chaleurrelation + transit + rapportqltepx +
    interactionclient + motulife + cartepostale + potentialite,
  scale = ~ rapportqltepx + cartepostale,
  link = "logit",
  data = comment_X
)
summary(rego)

## formula: 
## note_avis ~ -1 + dithyrambe + Chaleurrelation + transit + rapportqltepx + interactionclient + motulife + cartepostale + potentialite
## scale:   ~rapportqltepx + cartepostale
## data:    comment_X
## 
##  link  threshold nobs  logLik    AIC      niter max.grad cond.H 
##  logit flexible  26467 -26762.78 53553.57 10(0) 4.63e-12 1.2e+05
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## dithyrambe          3.8253     0.9532   4.013 5.99e-05 ***
## Chaleurrelation     3.9168     0.9530   4.110 3.96e-05 ***
## transit             0.2103     0.9522   0.221   0.8252    
## rapportqltepx       1.0380     0.9519   1.090   0.2755    
## interactionclient   0.1648     0.9530   0.173   0.8627    
## motulife            1.8355     0.9523   1.927   0.0539 .  
## cartepostale        2.1003     0.9522   2.206   0.0274 *  
## potentialite        1.6278     0.9525   1.709   0.0874 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## log-scale coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## rapportqltepx -0.56562    0.03117  -18.15   <2e-16 ***
## cartepostale  -0.43010    0.03497  -12.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Threshold coefficients:
##     Estimate Std. Error z value
## 1|2  -1.7929     0.9518  -1.884
## 2|3  -0.8513     0.9515  -0.895
## 3|4   0.2998     0.9514   0.315
## 4|5   1.7765     0.9514   1.867

# Tidy coefficient table of the ordinal model: exponentiated estimates with
# confidence intervals; str() shows the resulting columns.
tmp1 <- tidy(
  rego,
  exponentiate = TRUE,
  conf.int = TRUE
)
str(tmp1)

## Classes 'tbl_df', 'tbl' and 'data.frame':    14 obs. of  8 variables:
##  $ term            : chr  "1|2" "2|3" "3|4" "4|5" ...
##  $ estimate        : num  0.166 0.427 1.35 5.909 8.169 ...
##  $ std.error       : num  0.952 0.952 0.951 0.951 0.952 ...
##  $ statistic       : num  -1.884 -0.895 0.315 1.867 2.206 ...
##  $ p.value         : num  0.0596 0.371 0.7527 0.0619 0.0274 ...
##  $ conf.low        : num  NA NA NA NA 1.33 ...
##  $ conf.high       : num  NA NA NA NA 58.6 ...
##  $ coefficient_type: chr  "alpha" "alpha" "alpha" "alpha" ...

# Forest plot of the exponentiated estimates of the ordinal model on a log10
# axis; the vertical line at 1 marks "no effect".
ggplot(
  tmp1,
  aes(x = estimate, y = term, xmin = conf.low, xmax = conf.high)
) +
  geom_vline(xintercept = 1) +
  geom_errorbarh() +
  geom_point() +
  scale_x_log10() +
  theme_minimal()

# Effect displays for the ordinal model, left disabled:
#plot(allEffects(rego))

# Binary logit: note_5 is 1 for a 5-star review and 0 otherwise, regressed on
# the eight topic weights without an intercept.
comment_X$note_5 <- as.numeric(comment_X$note_avis %in% 5)
#freq(comment_X$note_avis )
reg <- glm(
  note_5 ~ -1 + dithyrambe + Chaleurrelation + rapportqltepx + transit +
    interactionclient + motulife + cartepostale + potentialite,
  family = binomial(logit),
  data = comment_X
)
reg

## 
## Call:  glm(formula = note_5 ~ -1 + dithyrambe + Chaleurrelation + rapportqltepx + 
##     transit + interactionclient + motulife + cartepostale + potentialite, 
##     family = binomial(logit), data = comment_X)
## 
## Coefficients:
##        dithyrambe    Chaleurrelation      rapportqltepx  
##           2.15372            2.18238           -1.17492  
##           transit  interactionclient           motulife  
##          -1.94377           -0.85652            0.06166  
##      cartepostale       potentialite  
##           0.43087           -0.12011  
## 
## Degrees of Freedom: 26467 Total (i.e. Null);  26459 Residual
## Null Deviance:       36690 
## Residual Deviance: 31280     AIC: 31290

# Exponentiated coefficients of the binary logit (odds ratios).
reg %>%
  coef() %>%
  exp()

##        dithyrambe   Chaleurrelation     rapportqltepx           transit 
##         8.6168706         8.8673881         0.3088436         0.1431637 
## interactionclient          motulife      cartepostale      potentialite 
##         0.4246380         1.0635983         1.5385881         0.8868249

# Tidy coefficient table of the binary logit: exponentiated estimates with
# confidence intervals; str() shows the resulting columns.
tmp <- tidy(
  reg,
  exponentiate = TRUE,
  conf.int = TRUE
)
str(tmp)

## Classes 'tbl_df', 'tbl' and 'data.frame':    8 obs. of  7 variables:
##  $ term     : chr  "dithyrambe" "Chaleurrelation" "rapportqltepx" "transit" ...
##  $ estimate : num  8.617 8.867 0.309 0.143 0.425 ...
##  $ std.error: num  0.0596 0.0556 0.0538 0.0603 0.0651 ...
##  $ statistic: num  36.1 39.3 -21.8 -32.2 -13.2 ...
##  $ p.value  : num  1.04e-285 0.00 9.38e-106 1.34e-227 1.51e-39 ...
##  $ conf.low : num  7.673 7.959 0.278 0.127 0.374 ...
##  $ conf.high: num  9.694 9.896 0.343 0.161 0.482 ...

# Forest plot of the odds ratios of the binary logit on a log10 axis; the
# vertical line at 1 marks "no effect".
ggplot(
  tmp,
  aes(x = estimate, y = term, xmin = conf.low, xmax = conf.high)
) +
  geom_vline(xintercept = 1) +
  geom_errorbarh() +
  geom_point() +
  scale_x_log10() +
  theme_minimal()

# Effect displays computed by allEffects() for the binary logit.
plot(allEffects(reg))

# Validation on the data used for fitting: each review is classified as 5* or
# not from the fitted probability of `reg` (cut-off 0.5), then cross-tabulated
# against the observed class.
class_levels <- c("<5*", "=5*")

comment_X$pred <- predict(reg, type = "response", newdata = comment_X)

# As in the original assignment-by-index, a missing probability or a missing
# note_5 falls into the "<5*" class.
predicted_5 <- !is.na(comment_X$pred) & comment_X$pred > 0.5
comment_X$pred_c <- ifelse(predicted_5, "=5*", "<5*")
comment_X$true <- ifelse(comment_X$note_5 %in% 1, "=5*", "<5*")

# Fixed factor levels keep the table 2x2 (rows = predicted, columns = observed)
# even when one class is never predicted, which confusionMatrix() needs.
confusion <- table(
  factor(comment_X$pred_c, levels = class_levels),
  factor(comment_X$true, levels = class_levels)
)
confusionMatrix(confusion, positive = "=5*")

## Confusion Matrix and Statistics
## 
##      
##         <5*   =5*
##   <5*  7645  3761
##   =5*  4464 10597
##                                           
##                Accuracy : 0.6892          
##                  95% CI : (0.6836, 0.6948)
##     No Information Rate : 0.5425          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3711          
##                                           
##  Mcnemar's Test P-Value : 9.902e-15       
##                                           
##             Sensitivity : 0.7381          
##             Specificity : 0.6313          
##          Pos Pred Value : 0.7036          
##          Neg Pred Value : 0.6703          
##              Prevalence : 0.5425          
##          Detection Rate : 0.4004          
##    Detection Prevalence : 0.5690          
##       Balanced Accuracy : 0.6847          
##                                           
##        'Positive' Class : =5*             
##

# Precision, recall and F1 for the "=5*" class, i.e. the class declared as
# positive in confusionMatrix() above.
# Without `relevant`, caret's table methods use the first row name ("<5*"),
# which reported the negative predictive value and the specificity instead.
precision(confusion, relevant = "=5*")

## [1] 0.7036053

recall(confusion, relevant = "=5*")

## [1] 0.7380554

F_meas(confusion, relevant = "=5*")

## [1] 0.7204188

# Compositional variant of the binary logit: the eight topic weights are
# clr-transformed, and the model is refitted on seven of them (potentialite is
# left out), without an intercept.
library(compositions)
comp <- comment_X[, c(
  "dithyrambe", "Chaleurrelation", "rapportqltepx", "transit",
  "interactionclient", "motulife", "cartepostale", "potentialite"
)]
comp2 <- clr(comp)
note_5 <- comment_X["note_5"]
comp3 <- cbind(note_5, comp2)

regc <- glm(
  note_5 ~ -1 + dithyrambe + Chaleurrelation + rapportqltepx + transit +
    interactionclient + motulife + cartepostale,
  family = binomial(logit),
  data = comp3
)
summary(regc)

## 
## Call:
## glm(formula = note_5 ~ -1 + dithyrambe + Chaleurrelation + rapportqltepx + 
##     transit + interactionclient + motulife + cartepostale, family = binomial(logit), 
##     data = comp3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2995  -1.1372   0.8085   1.1585   2.3338  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## dithyrambe         0.49976    0.02900  17.234  < 2e-16 ***
## Chaleurrelation    0.51368    0.02915  17.624  < 2e-16 ***
## rapportqltepx     -0.29685    0.02948 -10.069  < 2e-16 ***
## transit           -0.46314    0.02928 -15.819  < 2e-16 ***
## interactionclient -0.21025    0.02997  -7.016 2.29e-12 ***
## motulife           0.03105    0.02794   1.111    0.266    
## cartepostale       0.12958    0.02779   4.664 3.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 36691  on 26467  degrees of freedom
## Residual deviance: 34842  on 26460  degrees of freedom
## AIC: 34856
## 
## Number of Fisher Scoring iterations: 4

# Forest plot of the exponentiated coefficients of the clr-based binary logit
# (log10 axis, reference line at 1), followed by its effect displays.
tmp2 <- tidy(
  regc,
  exponentiate = TRUE,
  conf.int = TRUE
)
ggplot(
  tmp2,
  aes(x = estimate, y = term, xmin = conf.low, xmax = conf.high)
) +
  geom_vline(xintercept = 1) +
  geom_errorbarh() +
  geom_point() +
  scale_x_log10() +
  theme_minimal()

plot(allEffects(regc))

# Ordinal counterpart of the clr-based model: proportional-odds regression of
# the ordered rating on the seven clr-transformed topics.
# Despite its name, note_5 holds the note_avis column here.
note_5 <- comment_X["note_avis"]
comp4 <- cbind(note_5, comp2)
# Attaching MASS after dplyr masks select(); the dplyr::select prefix is
# needed from this point on.
library(MASS)
regp <- polr(
  note_avis ~ dithyrambe + Chaleurrelation + rapportqltepx + transit +
    interactionclient + motulife + cartepostale,
  data = comp4,
  Hess = TRUE
)
summary(regp)

## Call:
## polr(formula = note_avis ~ dithyrambe + Chaleurrelation + rapportqltepx + 
##     transit + interactionclient + motulife + cartepostale, data = comp4, 
##     Hess = TRUE)
## 
## Coefficients:
##                      Value Std. Error  t value
## dithyrambe         0.46186    0.02666  17.3245
## Chaleurrelation    0.44637    0.02667  16.7391
## rapportqltepx     -0.21517    0.02707  -7.9498
## transit           -0.48273    0.02698 -17.8953
## interactionclient -0.34827    0.02910 -11.9688
## motulife           0.01519    0.02605   0.5831
## cartepostale       0.14831    0.02585   5.7380
## 
## Intercepts:
##     Value     Std. Error t value  
## 1|2   -3.8437    0.0410   -93.8031
## 2|3   -2.8663    0.0266  -107.9502
## 3|4   -1.6757    0.0171   -98.1052
## 4|5   -0.1455    0.0128   -11.3869
## 
## Residual Deviance: 58176.49 
## AIC: 58198.49

# Plot the effect displays computed by allEffects() for the polr model regp.
plot(allEffects(regp))

Autres corrélations entre indicateurs

# Correlation matrix of the volume, readability, lexical-diversity and
# sentiment-score indicators, drawn twice with corrplot.
# Other candidate columns noted by the author: C, K, Maas, positivity,
# negativity, expressivity, sent_score.
M <- cor(
  comment_X[, c(
    "meanSentenceLength", "meanWordSyllables", "nbcar.x", "WPS", "WC",
    "ARI", "Coleman.Liau.grade", "TTR", "C", "Maas", "sent_score"
  )]
)
library(corrplot)
corrplot(M, type = "lower")

corrplot.mixed(M)

Les 5 V des avis - Trip Advisor en Polynésie

PG, SC, et CB - Cetop

17 septembre 2019