Un outil pour constituer des dictionnaires spécifiques. Pour le corpus “Confinementjour”, les thèmes de prédiction sont les marqueurs de l’expérience du confinement :
L’enjeu : retrouver les termes qui se rapprochent le plus du concept que l’on cherche à cerner.
Processus : 1) analyse qualitative des tweets et définition des thèmes ; 2) établissement d’un petit nombre de mots cibles, avec recours à la synonymie (Wolf, https://wonef.fr/contact/) ; 3) recherche des vecteurs les plus proches ; 4) réitération à partir de l’étape 2.
La condition sert à éviter de refaire les calculs, qui sont très longs (plusieurs heures).
On élimine les mots-outils : adpositions, déterminants, conjonctions de subordination, pronoms et numéraux.
On travaille sur des lemmes.
# Build the word2vec training corpus only when it is missing: the annotation
# and n-gram bundling steps are very long, so the prepared file
# "confinement_vec.txt" doubles as the cache marker.
if (!file.exists("confinement_vec.txt")) {
  # Initialise the French udpipe backend used by cnlp_annotate() below.
  cnlp_init_udpipe(model_name = "french")

  # Read the whole tweet data set.
  obj <- readRDS(file = "df_nrcliwc17.rds")

  # Strip @handles, #hashtags and URLs from a character vector.
  # NOTE(review): the pattern contains no emoji class, so emojis stay in the
  # text even though the original comment claimed they were removed.
  cleanText <- function(x) {
    gsub("(@\\S+)*(#\\S+)*(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)*", "", x)
  }
  obj$text <- cleanText(obj$text)

  # Drop tweets that have no word character left after cleaning.
  obj <- obj[grepl("\\w+", obj$text), ]

  # Annotate the tweets so that stop words can be identified by POS tag, and
  # keep a copy of the annotation on disk.
  Vocab <- cnlp_annotate(obj)
  write_rds(Vocab, "Vocab_17.rds")

  # Stop-word filtering: drop adpositions, determiners, subordinating
  # conjunctions, pronouns and numerals, plus tokens without a lemma.
  tokens <- Vocab$token
  stop_pos <- c("ADP", "DET", "SCONJ", "PRON", "NUM")
  updated_vocab <- tokens[!(tokens$upos %in% stop_pos) & !is.na(tokens$lemma), ]

  # Rebuild one line of space-separated lemmas per tweet. Calling paste() on
  # the data-frame column deparsed it into a single 'c("...")' string, and
  # write.table() added a header, row names and quotes to the corpus.
  # assumes the token table carries a doc_id column -- TODO confirm
  all_tweets <- vapply(
    split(updated_vocab$lemma, updated_vocab$doc_id, drop = TRUE),
    paste,
    character(1),
    collapse = " "
  )
  writeLines(enc2utf8(unname(all_tweets)), "tweets.txt", useBytes = TRUE)

  # Clean the text and bundle n-grams (up to 3 words) for model training.
  # The origin must be the file written just above ("tweets.txt"); with that
  # fixed, the former unconditional second call after this block is dropped
  # so the cache guard is effective again.
  prep_word2vec(
    origin = "tweets.txt",
    destination = "confinement_vec.txt",
    lowercase = TRUE,
    bundle_ngrams = 3
  )
}
## Starting training using file confinement_vec.txt
## Words processed: 100K Vocab size: 15K
Words processed: 200K Vocab size: 28K
Words processed: 300K Vocab size: 38K
Words processed: 400K Vocab size: 47K
Words processed: 500K Vocab size: 53K
Words processed: 600K Vocab size: 57K
Words processed: 700K Vocab size: 61K
Words processed: 800K Vocab size: 65K
Words processed: 900K Vocab size: 69K
Words processed: 1000K Vocab size: 73K
Words processed: 1100K Vocab size: 78K
Words processed: 1200K Vocab size: 85K
Words processed: 1300K Vocab size: 95K
Words processed: 1400K Vocab size: 102K
Words processed: 1500K Vocab size: 113K
Words processed: 1600K Vocab size: 125K
Words processed: 1700K Vocab size: 136K
Words processed: 1800K Vocab size: 149K
Words processed: 1900K Vocab size: 155K
Words processed: 2000K Vocab size: 160K
Words processed: 2100K Vocab size: 165K
Words processed: 2200K Vocab size: 169K
Words processed: 2300K Vocab size: 176K
Words processed: 2400K Vocab size: 185K
Words processed: 2500K Vocab size: 193K
Words processed: 2600K Vocab size: 201K
Words processed: 2700K Vocab size: 210K
Words processed: 2800K Vocab size: 219K
Words processed: 2900K Vocab size: 228K
Words processed: 3000K Vocab size: 237K
Words processed: 3100K Vocab size: 245K
Words processed: 3200K Vocab size: 253K
Words processed: 3300K Vocab size: 261K
Words processed: 3400K Vocab size: 273K
Words processed: 3500K Vocab size: 276K
Words processed: 3600K Vocab size: 280K
Words processed: 3700K Vocab size: 282K
Words processed: 3800K Vocab size: 283K
Words processed: 3900K Vocab size: 285K
Words processed: 4000K Vocab size: 286K
Words processed: 4100K Vocab size: 288K
Words processed: 4200K Vocab size: 290K
Words processed: 4300K Vocab size: 292K
Words processed: 4400K Vocab size: 295K
Words processed: 4500K Vocab size: 299K
Words processed: 4600K Vocab size: 303K
Words processed: 4700K Vocab size: 307K
Words processed: 4800K Vocab size: 311K
Words processed: 4900K Vocab size: 315K
Words processed: 5000K Vocab size: 321K
Words processed: 5100K Vocab size: 327K
Words processed: 5200K Vocab size: 334K
Words processed: 5300K Vocab size: 340K
Words processed: 5400K Vocab size: 345K
Words processed: 5500K Vocab size: 349K
Words processed: 5600K Vocab size: 351K
Words processed: 5700K Vocab size: 354K
Words processed: 5800K Vocab size: 357K
Words processed: 5900K Vocab size: 362K
Words processed: 6000K Vocab size: 366K
Words processed: 6100K Vocab size: 370K
Words processed: 6200K Vocab size: 375K
Words processed: 6300K Vocab size: 381K
Words processed: 6400K Vocab size: 386K
Words processed: 6500K Vocab size: 392K
Words processed: 6600K Vocab size: 398K
Words processed: 6700K Vocab size: 404K
Words processed: 6800K Vocab size: 410K
Words processed: 6900K Vocab size: 416K
Words processed: 7000K Vocab size: 422K
Words processed: 7100K Vocab size: 426K
Words processed: 7200K Vocab size: 429K
Words processed: 7300K Vocab size: 432K
Words processed: 7400K Vocab size: 435K
Words processed: 7500K Vocab size: 440K
Words processed: 7600K Vocab size: 445K
Words processed: 7700K Vocab size: 450K
Words processed: 7800K Vocab size: 455K
Words processed: 7900K Vocab size: 460K
Words processed: 8000K Vocab size: 465K
Words processed: 8100K Vocab size: 471K
Words processed: 8200K Vocab size: 476K
Words processed: 8300K Vocab size: 483K
Words processed: 8400K Vocab size: 490K
Words processed: 8500K Vocab size: 498K
Words processed: 8600K Vocab size: 507K
Words processed: 8700K Vocab size: 515K
Words processed: 8800K Vocab size: 520K
Words processed: 8900K Vocab size: 523K
Words processed: 9000K Vocab size: 527K
Words processed: 9100K Vocab size: 532K
Words processed: 9200K Vocab size: 538K
Words processed: 9300K Vocab size: 542K
Words processed: 9400K Vocab size: 549K
Words processed: 9500K Vocab size: 555K
Words processed: 9600K Vocab size: 561K
Words processed: 9700K Vocab size: 569K
Words processed: 9800K Vocab size: 576K
Words processed: 9900K Vocab size: 583K
Words processed: 10000K Vocab size: 590K
Words processed: 10100K Vocab size: 594K
Words processed: 10200K Vocab size: 598K
Words processed: 10300K Vocab size: 602K
Words processed: 10400K Vocab size: 607K
Words processed: 10500K Vocab size: 610K
Words processed: 10600K Vocab size: 615K
Words processed: 10700K Vocab size: 620K
Words processed: 10800K Vocab size: 625K
Words processed: 10900K Vocab size: 631K
Words processed: 11000K Vocab size: 637K
Words processed: 11100K Vocab size: 643K
Words processed: 11200K Vocab size: 651K
Words processed: 11300K Vocab size: 657K
Words processed: 11400K Vocab size: 662K
Words processed: 11500K Vocab size: 667K
Words processed: 11600K Vocab size: 672K
Words processed: 11700K Vocab size: 678K
Words processed: 11800K Vocab size: 685K
Words processed: 11900K Vocab size: 692K
Words processed: 12000K Vocab size: 697K
Words processed: 12100K Vocab size: 705K
Words processed: 12200K Vocab size: 713K
Words processed: 12300K Vocab size: 720K
Words processed: 12400K Vocab size: 725K
Words processed: 12500K Vocab size: 729K
Words processed: 12600K Vocab size: 734K
Words processed: 12700K Vocab size: 740K
Words processed: 12800K Vocab size: 746K
Words processed: 12900K Vocab size: 752K
Words processed: 13000K Vocab size: 758K
Words processed: 13100K Vocab size: 765K
Words processed: 13200K Vocab size: 767K
Words processed: 13300K Vocab size: 768K
Words processed: 13400K Vocab size: 768K
Words processed: 13500K Vocab size: 768K
Words processed: 13600K Vocab size: 769K
Words processed: 13700K Vocab size: 769K
Words processed: 13800K Vocab size: 769K
Words processed: 13900K Vocab size: 770K
Words processed: 14000K Vocab size: 771K
Words processed: 14100K Vocab size: 773K
Words processed: 14200K Vocab size: 776K
Words processed: 14300K Vocab size: 779K
Words processed: 14400K Vocab size: 782K
Words processed: 14500K Vocab size: 786K
Words processed: 14600K Vocab size: 791K
Words processed: 14700K Vocab size: 794K
Words processed: 14800K Vocab size: 799K
Words processed: 14900K Vocab size: 804K
Words processed: 15000K Vocab size: 811K
Words processed: 15100K Vocab size: 818K
Words processed: 15200K Vocab size: 821K
Words processed: 15300K Vocab size: 823K
Words processed: 15400K Vocab size: 827K
Words processed: 15500K Vocab size: 832K
Words processed: 15600K Vocab size: 837K
Words processed: 15700K Vocab size: 842K
Words processed: 15800K Vocab size: 847K
Words processed: 15900K Vocab size: 853K
Words processed: 16000K Vocab size: 859K
Words processed: 16100K Vocab size: 866K
Words processed: 16200K Vocab size: 870K
Words processed: 16300K Vocab size: 872K
Words processed: 16400K Vocab size: 875K
Words processed: 16500K Vocab size: 879K
Words processed: 16600K Vocab size: 881K
Words processed: 16700K Vocab size: 885K
Words processed: 16800K Vocab size: 889K
Words processed: 16900K Vocab size: 892K
Words processed: 17000K Vocab size: 896K
Words processed: 17100K Vocab size: 903K
Words processed: 17200K Vocab size: 906K
Words processed: 17300K Vocab size: 913K
Words processed: 17400K Vocab size: 918K
Words processed: 17500K Vocab size: 923K
Words processed: 17600K Vocab size: 928K
Words processed: 17700K Vocab size: 933K
Words processed: 17800K Vocab size: 937K
Words processed: 17900K Vocab size: 940K
Words processed: 18000K Vocab size: 944K
Words processed: 18100K Vocab size: 948K
Words processed: 18200K Vocab size: 949K
Words processed: 18300K Vocab size: 950K
Words processed: 18400K Vocab size: 950K
Words processed: 18500K Vocab size: 951K
Words processed: 18600K Vocab size: 953K
Words processed: 18700K Vocab size: 957K
Words processed: 18800K Vocab size: 960K
Words processed: 18900K Vocab size: 965K
Words processed: 19000K Vocab size: 971K
Words processed: 19100K Vocab size: 978K
Words processed: 19200K Vocab size: 984K
Words processed: 19300K Vocab size: 988K
Words processed: 19400K Vocab size: 990K
Words processed: 19500K Vocab size: 990K
Words processed: 19600K Vocab size: 991K
Words processed: 19700K Vocab size: 994K
Words processed: 19800K Vocab size: 997K
Words processed: 19900K Vocab size: 999K
Words processed: 20000K Vocab size: 1004K
Words processed: 20100K Vocab size: 1007K
Words processed: 20200K Vocab size: 1010K
Words processed: 20300K Vocab size: 1014K
Words processed: 20400K Vocab size: 1018K
Words processed: 20500K Vocab size: 1022K
Words processed: 20600K Vocab size: 1025K
Words processed: 20700K Vocab size: 1029K
Words processed: 20800K Vocab size: 1034K
Words processed: 20900K Vocab size: 1038K
Words processed: 21000K Vocab size: 1041K
Words processed: 21100K Vocab size: 1045K
Words processed: 21200K Vocab size: 1048K
Words processed: 21300K Vocab size: 1051K
Words processed: 21400K Vocab size: 1055K
Words processed: 21500K Vocab size: 1058K
Words processed: 21600K Vocab size: 1064K
## Vocab size (unigrams + bigrams): 651431
## Words in train file: 21657572
## Words written: 100K
Words written: 200K
Words written: 300K
Words written: 400K
Words written: 500K
Words written: 600K
Words written: 700K
Words written: 800K
Words written: 900K
Words written: 1000K
Words written: 1100K
Words written: 1200K
Words written: 1300K
Words written: 1400K
Words written: 1500K
Words written: 1600K
Words written: 1700K
Words written: 1800K
Words written: 1900K
Words written: 2000K
Words written: 2100K
Words written: 2200K
Words written: 2300K
Words written: 2400K
Words written: 2500K
Words written: 2600K
Words written: 2700K
Words written: 2800K
Words written: 2900K
Words written: 3000K
Words written: 3100K
Words written: 3200K
Words written: 3300K
Words written: 3400K
Words written: 3500K
Words written: 3600K
Words written: 3700K
Words written: 3800K
Words written: 3900K
Words written: 4000K
Words written: 4100K
Words written: 4200K
Words written: 4300K
Words written: 4400K
Words written: 4500K
Words written: 4600K
Words written: 4700K
Words written: 4800K
Words written: 4900K
Words written: 5000K
Words written: 5100K
Words written: 5200K
Words written: 5300K
Words written: 5400K
Words written: 5500K
Words written: 5600K
Words written: 5700K
Words written: 5800K
Words written: 5900K
Words written: 6000K
Words written: 6100K
Words written: 6200K
Words written: 6300K
Words written: 6400K
Words written: 6500K
Words written: 6600K
Words written: 6700K
Words written: 6800K
Words written: 6900K
Words written: 7000K
Words written: 7100K
Words written: 7200K
Words written: 7300K
Words written: 7400K
Words written: 7500K
Words written: 7600K
Words written: 7700K
Words written: 7800K
Words written: 7900K
Words written: 8000K
Words written: 8100K
Words written: 8200K
Words written: 8300K
Words written: 8400K
Words written: 8500K
Words written: 8600K
Words written: 8700K
Words written: 8800K
Words written: 8900K
Words written: 9000K
Words written: 9100K
Words written: 9200K
Words written: 9300K
Words written: 9400K
Words written: 9500K
Words written: 9600K
Words written: 9700K
Words written: 9800K
Words written: 9900K
Words written: 10000K
Words written: 10100K
Words written: 10200K
Words written: 10300K
Words written: 10400K
Words written: 10500K
Words written: 10600K
Words written: 10700K
Words written: 10800K
Words written: 10900K
Words written: 11000K
Words written: 11100K
Words written: 11200K
Words written: 11300K
Words written: 11400K
Words written: 11500K
Words written: 11600K
Words written: 11700K
Words written: 11800K
Words written: 11900K
Words written: 12000K
Words written: 12100K
Words written: 12200K
Words written: 12300K
Words written: 12400K
Words written: 12500K
Words written: 12600K
Words written: 12700K
Words written: 12800K
Words written: 12900K
Words written: 13000K
Words written: 13100K
Words written: 13200K
Words written: 13300K
Words written: 13400K
Words written: 13500K
Words written: 13600K
Words written: 13700K
Words written: 13800K
Words written: 13900K
Words written: 14000K
Words written: 14100K
Words written: 14200K
Words written: 14300K
Words written: 14400K
Words written: 14500K
Words written: 14600K
Words written: 14700K
Words written: 14800K
Words written: 14900K
Words written: 15000K
Words written: 15100K
Words written: 15200K
Words written: 15300K
Words written: 15400K
Words written: 15500K
Words written: 15600K
Words written: 15700K
Words written: 15800K
Words written: 15900K
Words written: 16000K
Words written: 16100K
Words written: 16200K
Words written: 16300K
Words written: 16400K
Words written: 16500K
Words written: 16600K
Words written: 16700K
Words written: 16800K
Words written: 16900K
Words written: 17000K
Words written: 17100K
Words written: 17200K
Words written: 17300K
Words written: 17400K
Words written: 17500K
Words written: 17600K
Words written: 17700K
Words written: 17800K
Words written: 17900K
Words written: 18000K
Words written: 18100K
Words written: 18200K
Words written: 18300K
Words written: 18400K
Words written: 18500K
Words written: 18600K
Words written: 18700K
Words written: 18800K
Words written: 18900K
Words written: 19000K
Words written: 19100K
Words written: 19200K
Words written: 19300K
Words written: 19400K
Words written: 19500K
Words written: 19600K
Words written: 19700K
Words written: 19800K
Words written: 19900K
Words written: 20000K
Words written: 20100K
Words written: 20200K
Words written: 20300K
Words written: 20400K
Words written: 20500K
Words written: 20600K
Words written: 20700K
Words written: 20800K
Words written: 20900K
Words written: 21000K
Words written: 21100K
Words written: 21200K
Words written: 21300K
Words written: 21400K
Words written: 21500K
Words written: 21600K
Starting training using file confinement_vec.txt_
## Words processed: 21700K Vocab size: 10K
Words processed: 21800K Vocab size: 26K
Words processed: 21900K Vocab size: 37K
Words processed: 22000K Vocab size: 49K
Words processed: 22100K Vocab size: 56K
Words processed: 22200K Vocab size: 61K
Words processed: 22300K Vocab size: 67K
Words processed: 22400K Vocab size: 72K
Words processed: 22500K Vocab size: 78K
Words processed: 22600K Vocab size: 87K
Words processed: 22700K Vocab size: 98K
Words processed: 22800K Vocab size: 108K
Words processed: 22900K Vocab size: 123K
Words processed: 23000K Vocab size: 138K
Words processed: 23100K Vocab size: 153K
Words processed: 23200K Vocab size: 162K
Words processed: 23300K Vocab size: 168K
Words processed: 23400K Vocab size: 175K
Words processed: 23500K Vocab size: 184K
Words processed: 23600K Vocab size: 196K
Words processed: 23700K Vocab size: 205K
Words processed: 23800K Vocab size: 216K
Words processed: 23900K Vocab size: 229K
Words processed: 24000K Vocab size: 241K
Words processed: 24100K Vocab size: 252K
Words processed: 24200K Vocab size: 263K
Words processed: 24300K Vocab size: 273K
Words processed: 24400K Vocab size: 288K
Words processed: 24500K Vocab size: 293K
Words processed: 24600K Vocab size: 298K
Words processed: 24700K Vocab size: 300K
Words processed: 24800K Vocab size: 302K
Words processed: 24900K Vocab size: 304K
Words processed: 25000K Vocab size: 307K
Words processed: 25100K Vocab size: 310K
Words processed: 25200K Vocab size: 315K
Words processed: 25300K Vocab size: 320K
Words processed: 25400K Vocab size: 325K
Words processed: 25500K Vocab size: 331K
Words processed: 25600K Vocab size: 337K
Words processed: 25700K Vocab size: 345K
Words processed: 25800K Vocab size: 354K
Words processed: 25900K Vocab size: 361K
Words processed: 26000K Vocab size: 369K
Words processed: 26100K Vocab size: 373K
Words processed: 26200K Vocab size: 377K
Words processed: 26300K Vocab size: 381K
Words processed: 26400K Vocab size: 388K
Words processed: 26500K Vocab size: 394K
Words processed: 26600K Vocab size: 399K
Words processed: 26700K Vocab size: 407K
Words processed: 26800K Vocab size: 415K
Words processed: 26900K Vocab size: 423K
Words processed: 27000K Vocab size: 431K
Words processed: 27100K Vocab size: 439K
Words processed: 27200K Vocab size: 448K
Words processed: 27300K Vocab size: 455K
Words processed: 27400K Vocab size: 460K
Words processed: 27500K Vocab size: 464K
Words processed: 27600K Vocab size: 469K
Words processed: 27700K Vocab size: 476K
Words processed: 27800K Vocab size: 483K
Words processed: 27900K Vocab size: 490K
Words processed: 28000K Vocab size: 496K
Words processed: 28100K Vocab size: 505K
Words processed: 28200K Vocab size: 513K
Words processed: 28300K Vocab size: 522K
Words processed: 28400K Vocab size: 532K
Words processed: 28500K Vocab size: 543K
Words processed: 28600K Vocab size: 554K
Words processed: 28700K Vocab size: 562K
Words processed: 28800K Vocab size: 566K
Words processed: 28900K Vocab size: 573K
Words processed: 29000K Vocab size: 581K
Words processed: 29100K Vocab size: 586K
Words processed: 29200K Vocab size: 596K
Words processed: 29300K Vocab size: 604K
Words processed: 29400K Vocab size: 614K
Words processed: 29500K Vocab size: 623K
Words processed: 29600K Vocab size: 633K
Words processed: 29700K Vocab size: 642K
Words processed: 29800K Vocab size: 647K
Words processed: 29900K Vocab size: 653K
Words processed: 30000K Vocab size: 659K
Words processed: 30100K Vocab size: 664K
Words processed: 30200K Vocab size: 671K
Words processed: 30300K Vocab size: 678K
Words processed: 30400K Vocab size: 686K
Words processed: 30500K Vocab size: 694K
Words processed: 30600K Vocab size: 703K
Words processed: 30700K Vocab size: 713K
Words processed: 30800K Vocab size: 720K
Words processed: 30900K Vocab size: 728K
Words processed: 31000K Vocab size: 735K
Words processed: 31100K Vocab size: 744K
Words processed: 31200K Vocab size: 754K
Words processed: 31300K Vocab size: 761K
Words processed: 31400K Vocab size: 772K
Words processed: 31500K Vocab size: 784K
Words processed: 31600K Vocab size: 791K
Words processed: 31700K Vocab size: 797K
Words processed: 31800K Vocab size: 805K
Words processed: 31900K Vocab size: 813K
Words processed: 32000K Vocab size: 820K
Words processed: 32100K Vocab size: 829K
Words processed: 32200K Vocab size: 838K
Words processed: 32300K Vocab size: 840K
Words processed: 32400K Vocab size: 840K
Words processed: 32500K Vocab size: 841K
Words processed: 32600K Vocab size: 841K
Words processed: 32700K Vocab size: 842K
Words processed: 32800K Vocab size: 843K
Words processed: 32900K Vocab size: 843K
Words processed: 33000K Vocab size: 846K
Words processed: 33100K Vocab size: 851K
Words processed: 33200K Vocab size: 855K
Words processed: 33300K Vocab size: 860K
Words processed: 33400K Vocab size: 866K
Words processed: 33500K Vocab size: 872K
Words processed: 33600K Vocab size: 878K
Words processed: 33700K Vocab size: 886K
Words processed: 33800K Vocab size: 897K
Words processed: 33900K Vocab size: 901K
Words processed: 34000K Vocab size: 905K
Words processed: 34100K Vocab size: 912K
Words processed: 34200K Vocab size: 918K
Words processed: 34300K Vocab size: 924K
Words processed: 34400K Vocab size: 932K
Words processed: 34500K Vocab size: 940K
Words processed: 34600K Vocab size: 949K
Words processed: 34700K Vocab size: 956K
Words processed: 34800K Vocab size: 960K
Words processed: 34900K Vocab size: 964K
Words processed: 35000K Vocab size: 968K
Words processed: 35100K Vocab size: 973K
Words processed: 35200K Vocab size: 978K
Words processed: 35300K Vocab size: 982K
Words processed: 35400K Vocab size: 990K
Words processed: 35500K Vocab size: 995K
Words processed: 35600K Vocab size: 1005K
Words processed: 35700K Vocab size: 1013K
Words processed: 35800K Vocab size: 1020K
Words processed: 35900K Vocab size: 1028K
Words processed: 36000K Vocab size: 1033K
Words processed: 36100K Vocab size: 1038K
Words processed: 36200K Vocab size: 1044K
Words processed: 36300K Vocab size: 1048K
Words processed: 36400K Vocab size: 1048K
Words processed: 36500K Vocab size: 1049K
Words processed: 36600K Vocab size: 1050K
Words processed: 36700K Vocab size: 1057K
Words processed: 36800K Vocab size: 1061K
Words processed: 36900K Vocab size: 1068K
Words processed: 37000K Vocab size: 1077K
Words processed: 37100K Vocab size: 1085K
Words processed: 37200K Vocab size: 1091K
Words processed: 37300K Vocab size: 1095K
Words processed: 37400K Vocab size: 1095K
Words processed: 37500K Vocab size: 1098K
Words processed: 37600K Vocab size: 1102K
Words processed: 37700K Vocab size: 1106K
Words processed: 37800K Vocab size: 1112K
Words processed: 37900K Vocab size: 1116K
Words processed: 38000K Vocab size: 1122K
Words processed: 38100K Vocab size: 1127K
Words processed: 38200K Vocab size: 1133K
Words processed: 38300K Vocab size: 1137K
Words processed: 38400K Vocab size: 1145K
Words processed: 38500K Vocab size: 1151K
Words processed: 38600K Vocab size: 1155K
Words processed: 38700K Vocab size: 1161K
Words processed: 38800K Vocab size: 1165K
Words processed: 38900K Vocab size: 1170K
Words processed: 39000K Vocab size: 1175K
Words processed: 39100K Vocab size: 1184K
## Vocab size (unigrams + bigrams): 716742
## Words in train file: 39106573
## Words written: 100K
Words written: 200K
Words written: 300K
Words written: 400K
Words written: 500K
Words written: 600K
Words written: 700K
Words written: 800K
Words written: 900K
Words written: 1000K
Words written: 1100K
Words written: 1200K
Words written: 1300K
Words written: 1400K
Words written: 1500K
Words written: 1600K
Words written: 1700K
Words written: 1800K
Words written: 1900K
Words written: 2000K
Words written: 2100K
Words written: 2200K
Words written: 2300K
Words written: 2400K
Words written: 2500K
Words written: 2600K
Words written: 2700K
Words written: 2800K
Words written: 2900K
Words written: 3000K
Words written: 3100K
Words written: 3200K
Words written: 3300K
Words written: 3400K
Words written: 3500K
Words written: 3600K
Words written: 3700K
Words written: 3800K
Words written: 3900K
Words written: 4000K
Words written: 4100K
Words written: 4200K
Words written: 4300K
Words written: 4400K
Words written: 4500K
Words written: 4600K
Words written: 4700K
Words written: 4800K
Words written: 4900K
Words written: 5000K
Words written: 5100K
Words written: 5200K
Words written: 5300K
Words written: 5400K
Words written: 5500K
Words written: 5600K
Words written: 5700K
Words written: 5800K
Words written: 5900K
Words written: 6000K
Words written: 6100K
Words written: 6200K
Words written: 6300K
Words written: 6400K
Words written: 6500K
Words written: 6600K
Words written: 6700K
Words written: 6800K
Words written: 6900K
Words written: 7000K
Words written: 7100K
Words written: 7200K
Words written: 7300K
Words written: 7400K
Words written: 7500K
Words written: 7600K
Words written: 7700K
Words written: 7800K
Words written: 7900K
Words written: 8000K
Words written: 8100K
Words written: 8200K
Words written: 8300K
Words written: 8400K
Words written: 8500K
Words written: 8600K
Words written: 8700K
Words written: 8800K
Words written: 8900K
Words written: 9000K
Words written: 9100K
Words written: 9200K
Words written: 9300K
Words written: 9400K
Words written: 9500K
Words written: 9600K
Words written: 9700K
Words written: 9800K
Words written: 9900K
Words written: 10000K
Words written: 10100K
Words written: 10200K
Words written: 10300K
Words written: 10400K
Words written: 10500K
Words written: 10600K
Words written: 10700K
Words written: 10800K
Words written: 10900K
Words written: 11000K
Words written: 11100K
Words written: 11200K
Words written: 11300K
Words written: 11400K
Words written: 11500K
Words written: 11600K
Words written: 11700K
Words written: 11800K
Words written: 11900K
Words written: 12000K
Words written: 12100K
Words written: 12200K
Words written: 12300K
Words written: 12400K
Words written: 12500K
Words written: 12600K
Words written: 12700K
Words written: 12800K
Words written: 12900K
Words written: 13000K
Words written: 13100K
Words written: 13200K
Words written: 13300K
Words written: 13400K
Words written: 13500K
Words written: 13600K
Words written: 13700K
Words written: 13800K
Words written: 13900K
Words written: 14000K
Words written: 14100K
Words written: 14200K
Words written: 14300K
Words written: 14400K
Words written: 14500K
Words written: 14600K
Words written: 14700K
Words written: 14800K
Words written: 14900K
Words written: 15000K
Words written: 15100K
Words written: 15200K
Words written: 15300K
Words written: 15400K
Words written: 15500K
Words written: 15600K
Words written: 15700K
Words written: 15800K
Words written: 15900K
Words written: 16000K
Words written: 16100K
Words written: 16200K
Words written: 16300K
Words written: 16400K
Words written: 16500K
Words written: 16600K
Words written: 16700K
Words written: 16800K
Words written: 16900K
Words written: 17000K
Words written: 17100K
Words written: 17200K
Words written: 17300K
Words written: 17400K
# Create and train the vector model, or reload it when it was already trained.
# The existence check, the training output and the reload now share a single
# path: the original checked and reloaded "confinement_vec_model.bin" while
# training wrote "confinement_vec.bin", so the saved model was never reused.
model_file <- "confinement_vec.bin"
if (!file.exists(model_file)) {
  model <- train_word2vec(
    "confinement_vec.txt", model_file,
    vectors = 100, threads = 4, window = 10, iter = 5, negative_samples = 0
  )
} else {
  model <- read.vectors(model_file)
}
##
|
| | 0%
|
| | 1%
|
|= | 1%
|
|= | 2%
|
|== | 2%
|
|== | 3%
|
|== | 4%
|
|=== | 4%
|
|=== | 5%
|
|==== | 5%
|
|==== | 6%
|
|===== | 6%
|
|===== | 7%
|
|===== | 8%
|
|====== | 8%
|
|====== | 9%
|
|======= | 9%
|
|======= | 10%
|
|======= | 11%
|
|======== | 11%
|
|======== | 12%
|
|========= | 12%
|
|========= | 13%
|
|========= | 14%
|
|========== | 14%
|
|========== | 15%
|
|=========== | 15%
|
|=========== | 16%
|
|============ | 16%
|
|============ | 17%
|
|============ | 18%
|
|============= | 18%
|
|============= | 19%
|
|============== | 19%
|
|============== | 20%
|
|============== | 21%
|
|=============== | 21%
|
|=============== | 22%
|
|================ | 22%
|
|================ | 23%
|
|================ | 24%
|
|================= | 24%
|
|================= | 25%
|
|================== | 25%
|
|================== | 26%
|
|=================== | 26%
|
|=================== | 27%
|
|=================== | 28%
|
|==================== | 28%
|
|==================== | 29%
|
|===================== | 29%
|
|===================== | 30%
|
|===================== | 31%
|
|====================== | 31%
|
|====================== | 32%
|
|======================= | 32%
|
|======================= | 33%
|
|======================= | 34%
|
|======================== | 34%
|
|======================== | 35%
|
|========================= | 35%
|
|========================= | 36%
|
|========================== | 36%
|
|========================== | 37%
|
|========================== | 38%
|
|=========================== | 38%
|
|=========================== | 39%
|
|============================ | 39%
|
|============================ | 40%
|
|============================ | 41%
|
|============================= | 41%
|
|============================= | 42%
|
|============================== | 42%
|
|============================== | 43%
|
|============================== | 44%
|
|=============================== | 44%
|
|=============================== | 45%
|
|================================ | 45%
|
|================================ | 46%
|
|================================= | 46%
|
|================================= | 47%
|
|================================= | 48%
|
|================================== | 48%
|
|================================== | 49%
|
|=================================== | 49%
|
|=================================== | 50%
|
|=================================== | 51%
|
|==================================== | 51%
|
|==================================== | 52%
|
|===================================== | 52%
|
|===================================== | 53%
|
|===================================== | 54%
|
|====================================== | 54%
|
|====================================== | 55%
|
|======================================= | 55%
|
|======================================= | 56%
|
|======================================== | 56%
|
|======================================== | 57%
|
|======================================== | 58%
|
|========================================= | 58%
|
|========================================= | 59%
|
|========================================== | 59%
|
|========================================== | 60%
|
|========================================== | 61%
|
|=========================================== | 61%
|
|=========================================== | 62%
|
|============================================ | 62%
|
|============================================ | 63%
|
|============================================ | 64%
|
|============================================= | 64%
|
|============================================= | 65%
|
|============================================== | 65%
|
|============================================== | 66%
|
|=============================================== | 66%
|
|=============================================== | 67%
|
|=============================================== | 68%
|
|================================================ | 68%
|
|================================================ | 69%
|
|================================================= | 69%
|
|================================================= | 70%
|
|================================================= | 71%
|
|================================================== | 71%
|
|================================================== | 72%
|
|=================================================== | 72%
|
|=================================================== | 73%
|
|=================================================== | 74%
|
|==================================================== | 74%
|
|==================================================== | 75%
|
|===================================================== | 75%
|
|===================================================== | 76%
|
|====================================================== | 76%
|
|====================================================== | 77%
|
|====================================================== | 78%
|
|======================================================= | 78%
|
|======================================================= | 79%
|
|======================================================== | 79%
|
|======================================================== | 80%
|
|======================================================== | 81%
|
|========================================================= | 81%
|
|========================================================= | 82%
|
|========================================================== | 82%
|
|========================================================== | 83%
|
|========================================================== | 84%
|
|=========================================================== | 84%
|
|=========================================================== | 85%
|
|============================================================ | 85%
|
|============================================================ | 86%
|
|============================================================= | 86%
|
|============================================================= | 87%
|
|============================================================= | 88%
|
|============================================================== | 88%
|
|============================================================== | 89%
|
|=============================================================== | 89%
|
|=============================================================== | 90%
|
|=============================================================== | 91%
|
|================================================================ | 91%
|
|================================================================ | 92%
|
|================================================================= | 92%
|
|================================================================= | 93%
|
|================================================================= | 94%
|
|================================================================== | 94%
|
|================================================================== | 95%
|
|=================================================================== | 95%
|
|=================================================================== | 96%
|
|==================================================================== | 96%
|
|==================================================================== | 97%
|
|==================================================================== | 98%
|
|===================================================================== | 98%
|
|===================================================================== | 99%
|
|======================================================================| 99%
|
|======================================================================| 100%
# A first clustering: k-means over the rows of `model`, with a fixed seed so
# the partition and the sampled clusters below are reproducible.
set.seed(10)
centers <- 150
clustering <- kmeans(model, centers = centers, iter.max = 40)
# Draw 10 cluster ids at random and list the first 15 member names of each
# (one column per sampled cluster).
sapply(sample(seq_len(centers), 10), function(cluster_id) {
  members <- clustering$cluster[clustering$cluster == cluster_id]
  names(members[1:15])
})
## [,1] [,2]
## [1,] "d_put" "quartiers"
## [2,] "0001f1f7_pand_mie" "t_couvre_feu"
## [3,] "poisson_avril_farce_soit" "l_tat_ferme"
## [4,] "0001f41f_connaissez_origine_tradition" "yeux_non_respect"
## [5,] "o_tout_monde" "ait_demand_retirer_arr"
## [6,] "pas_eu_que" "rime_divertissement"
## [7,] "effets_n_fastes" "d_andorre_confinement"
## [8,] "d_quid_sauvage" "pense_toutes_sorties_j"
## [9,] "l_pensait_teinte_moyen" "ai_refus_parce"
## [10,] "avant_croire" "suis_choqu_pr_fet"
## [11,] "ou_partager_reste" "seine_saint_denis_ait"
## [12,] "chose_reste_ann" "mis_garde_maire"
## [13,] "v_rifie_source_information" "viter_propagation_du"
## [14,] "amende" "j_avais_flemme"
## [15,] "ge_licorne_sylvestre" "pfiouuuuuu"
## [,3] [,4]
## [1,] "morpion_chat" "0001f1f7"
## [2,] "viens_perdre_249eme_partie" "0001f1ee"
## [3,] "249eme_partie_morpion_chat" "0001f1ea"
## [4,] "0001f62d_viens_perdre" "0001f1f9"
## [5,] "des_dizaines_milliers" "0001f1ec"
## [6,] "ne_tient" "0001f1e9"
## [7,] "d_emplois" "0001f1f3"
## [8,] "ans_gr_ve" "0001f1e6"
## [9,] "aujourd'hui_syst_me" "envoy"
## [10,] "lits_supprim_s" "0001f1f2"
## [11,] "pourtant_tir_sonnette_alarme" "0001f1f8"
## [12,] "an_personnel_hospitalier_avait" "2192"
## [13,] "a_cause_homme" "0001f1e7"
## [14,] "d_laisser_mot" "0001f1f7_visitant_h_pital"
## [15,] "pangolins_danger_extinction" "aux_jeunes"
## [,5] [,6]
## [1,] "atteints_du" "respect"
## [2,] "compte_cas" "danger"
## [3,] "masque_plong_e" "sol"
## [4,] "72heures_pour_accueillir_patients" "ne_sert"
## [5,] "h_pital_terrain_militaire" "faire_respecter_confinement"
## [6,] "0001f1f9_va_construire" "non_respect"
## [7,] "provenance" "quand"
## [8,] "adapt_soignants_fran_ais" "zones_non_droit"
## [9,] "sont_avantages" "envoie_arm_e"
## [10,] "olivier_v_ran_ministre" "d_insister_mais"
## [11,] "a_roport_international" "sant_qu_mettent"
## [12,] "bloquent_piste_atterrissage_vols" "porte_parole_du"
## [13,] "que_reconna_t" "foutu"
## [14,] "aux_familles" "mort_gars"
## [15,] "co_te_200" "prot_g"
## [,7] [,8]
## [1,] "20e3" "meurt"
## [2,] "1" "peut_donner"
## [3,] "20e3_0" "cesser_faire"
## [4,] "2" "uvre_va"
## [5,] "20e3_heures" "figure_crit_re_ultime"
## [6,] "3" "cervelles_occidentales_grand_entretien"
## [7,] "20e3_2" "d_capant_grand"
## [8,] "20e3_8" "marcel_gauchet_lire"
## [9,] "20e3_6" "bas_co_t"
## [10,] "20e3_4" "faites_courses"
## [11,] "20e3_1" "nouvelle_version_attestation"
## [12,] "20e3_3" "pic_pid_mique"
## [13,] "4" "mission_sp_ciale"
## [14,] "20e3_minutes_1" "totalit"
## [15,] "mesures_urgence" "jour_nuit_veillons"
## [,9] [,10]
## [1,] "m_me_aujourdhui" "0001f1fa"
## [2,] "vais_passer_lendemain_sera" "mondiale"
## [3,] "pensant_journ_e" "a_marre"
## [4,] "fois_vais_aller_coucher" "masqu_e_confinement"
## [5,] "partie_population" "gr_ce_vie_continue"
## [6,] "vois_quelqu_dehors" "l_activit_professionnelle"
## [7,] "pr_occuper" "presque_avant_merci"
## [8,] "couvre_feu_tonnement_toute" "0001f465_j_profite_saluer"
## [9,] "partout_monde" "du_travail_invisible"
## [10,] "aujourd'hui_18_me_arrondissement" "journ_e_mondiale"
## [11,] "g_nies_manifestent_peu" "aux_etats_unis"
## [12,] "h_pital_georges_pompidou" "n_ficient_protection_sociale"
## [13,] "direct_mieux_comprendre" "0001f1f8_pand_mie"
## [14,] "au_extrait" "faire_soigner_du"
## [15,] "d_longue_interview" "co_te_moyenne_hospitalisation"
L’intérêt des vecteurs, c’est qu’ils peuvent s’additionner ou se soustraire :
roi - homme + femme = reine
pour reprendre cet exemple fameux.
# Health dimension: the 40 terms closest to the seed words combined by the
# formula. Other seed candidates kept for reference: "crise", "patients",
# "malades".
foo <- model %>%
  closest_to(~ "malade" + "patient" + "hopital" + "crise", 40)
# Discard the three top-ranked rows (presumably the seed words themselves --
# TODO confirm, since four seeds are used here).
foo <- foo[-(1:3), ]
# Copy the second column (the similarity score) under a fixed name for ggplot.
foo$Similarity <- foo[[2]]
# Dot plot, terms ordered by similarity and shown on the vertical axis.
g1 <- ggplot(foo, aes(x = reorder(word, Similarity), y = Similarity)) +
  geom_point(colour = "black", size = 3) +
  coord_flip() +
  theme_minimal() +
  ggtitle("N-grammes proches de la dimension sanitaire")
g1
# Solidarity dimension: the 30 terms closest to "lutte" + "solidarit".
foo <- model %>%
  closest_to(~ "lutte" + "solidarit", 30)
# Discard the two top-ranked rows (presumably the two seed words -- verify).
foo <- foo[-(1:2), ]
# Copy the second column (the similarity score) under a fixed name for ggplot.
foo$Similarity <- foo[[2]]
# Dot plot, terms ordered by similarity, similarity axis on a log10 scale.
g1 <- ggplot(foo, aes(x = reorder(word, Similarity), y = Similarity)) +
  geom_point(colour = "black", size = 3) +
  coord_flip() +
  theme_minimal() +
  scale_y_log10() +
  ggtitle("N-grammes proches de la solidarite")
g1
# Protection dimension: the 30 terms closest to "distance" + "sociale".
foo <- model %>%
  closest_to(~ "distance" + "sociale", 30)
# Discard the two top-ranked rows (presumably the two seed words -- verify).
foo <- foo[-(1:2), ]
# Copy the second column (the similarity score) under a fixed name for ggplot.
foo$Similarity <- foo[[2]]
# Dot plot, terms ordered by similarity, similarity axis on a log10 scale.
g1 <- ggplot(foo, aes(x = reorder(word, Similarity), y = Similarity)) +
  geom_point(colour = "black", size = 3) +
  coord_flip() +
  theme_minimal() +
  scale_y_log10() +
  ggtitle("N-grammes proches de la protection")
g1
# Political dimension: the 30 terms closest to "politique" + "gouvernement".
foo <- model %>%
  closest_to(~ "politique" + "gouvernement", 30)
# Discard the two top-ranked rows (presumably the two seed words -- verify).
foo <- foo[-(1:2), ]
# Copy the second column (the similarity score) under a fixed name for ggplot.
foo$Similarity <- foo[[2]]
# Dot plot, terms ordered by similarity, similarity axis on a log10 scale.
g1 <- ggplot(foo, aes(x = reorder(word, Similarity), y = Similarity)) +
  geom_point(colour = "black", size = 3) +
  coord_flip() +
  theme_minimal() +
  scale_y_log10() +
  ggtitle("N-grammes proches de la politique en temps de confinement")
g1
# Place of confinement: the 40 terms closest to the formula below, which adds
# "confin", "appartement" and "studio" and subtracts "maison".
foo <- model %>%
  closest_to(~ "confin" + "appartement" - "maison" + "studio", 40)
# Discard the three top-ranked rows (presumably the positively weighted seed
# words -- verify).
foo <- foo[-(1:3), ]
# Copy the second column (the similarity score) under a fixed name for ggplot.
foo$Similarity <- foo[[2]]
# Dot plot, terms ordered by similarity, similarity axis on a log10 scale.
g1 <- ggplot(foo, aes(x = reorder(word, Similarity), y = Similarity)) +
  geom_point(colour = "black", size = 3) +
  coord_flip() +
  theme_minimal() +
  scale_y_log10() +
  ggtitle("N-grammes proches du lieu de confinement")
g1
# Seed words for the remote-work / connectivity theme.
q_words <- c("teletravail", "travail", "internet", "connexion")
# For each seed word, collect the 30 terms closest to its vector, then flatten
# the per-seed results into a single character vector.
term_set <- unlist(lapply(q_words, function(q_word) {
  neighbours <- closest_to(model, model[[q_word]], 30)
  neighbours$word
}))
# Keep the individual vectors of those terms (no averaging into one vector).
# NOTE(review): `subset` shadows base::subset(); the name is kept because the
# code that follows reads it.
subset <- model[[term_set, average = FALSE]]
library(Rtsne)
library(RColorBrewer)
# Project the selected word vectors with t-SNE. The seed makes the layout
# reproducible; perplexity is explicitly set to 5.
set.seed(57)
rtsne_out <- Rtsne(as.matrix(subset), perplexity = 5)
# Optional export of the figure to a 2400x1800 JPEG (disabled):
#jpeg("fig.jpg", width=2400, height=1800)
# Hand-picked palette; in this chunk it is only referenced by a commented-out
# `col=` argument of the text() call further down.
color.vec <- c(
  "#556270", "#4ECDC4", "#1B676B", "#FF6B6B",
  "#C44D58", "seagreen1", "seagreen4", "slateblue4"
)
# Print the 8-colour "Set3" palette for reference (the value is not stored).
brewer.pal(8, "Set3")
## [1] "#8DD3C7" "#FFFFB3" "#BEBADA" "#FB8072" "#80B1D3" "#FDB462" "#B3DE69"
## [8] "#FCCDE5"
# Disabled: turn a cluster assignment `clus` into a data frame keyed by word.
#clus<-as.data.frame(clus)
#clus$word<-rownames(clus)
# One row per plotted term, with the term duplicated in a `word` column
# (the join key expected by the disabled left_join below).
terms <- as.data.frame(rownames(subset))
terms$word <- terms[[1]]
#terms<-terms %>% left_join(clus, by = "word")
# Set up an empty plotting region spanning the t-SNE coordinates ...
plot(rtsne_out$Y, type = "n")
#count(terms, clus)$n[2]
# ... then write each term at its 2-D position. Colouring by cluster is
# disabled: col=color.vec[terms$clus]
text(rtsne_out$Y, labels = rownames(subset), cex = 0.7)