# Verschiedene Plots zu den Geburtsgeschichten und den typischen n-Grammen mit Positionsangaben in den Geschichten # Zuerst ein 3d-Plot zur Analyse von n-Grammen, dann ein 2d-Plot zur Korrelation von Position und Standardabweichung # Notwendig ist ein Datenbankzugriff auf die Daten. Dazu bitte Noah Bubenhofer kontaktieren! # Noah Bubenhofer: Serialität der Singularität. Korpusanalyse narrativer Muster in Geburtsberichten # http://www.bubenhofer.com/publikationen/geburtsberichte/ ### Versuche mit 3d-Plots ## Datenbankanbindung # DB # install.packages("RPostgreSQL") require("RPostgreSQL") # create a connection # save the password that we can "hide" it as best as we can by collapsing it pw <- { "" } # loads the PostgreSQL driver drv <- dbDriver("PostgreSQL") # creates a connection to the postgres database # note that "con" will be used later in each connection to the database con <- dbConnect(drv, dbname = "visling", host = "", port = , user = "", password = pw) rm(pw) # removes the password # check for the cartable dbExistsTable(con, c("ngrams","ngrams")) # TRUE corpusname <- "geburtsberichtelight_ngrams_lemmas"; #minfreq <- 10; minfreq <- 40; #minfreq <- 6; titleofplot <- "Geburtsberichte" query <- paste0("select ngrams.id, ngrams.\"idNgram\", ngrams.\"freqC1\", ngrams.\"freqC2\", ngrams.\"freqExpected\", ngrams.\"pValue\", ngrams.\"clusterOrder\", ngrams.\"clusterGroup\", ngrams.\"idCorpus\", ngrams.\"nrOfInstances\", ngrams.\"nrOfUniqueInstances\", ngrams.textpos_mean, ngrams.textpos_stddev, concat_ws(' ', ngrams.\"t1\", ngrams.\"t1Pos\", ngrams.\"t2\", ngrams.\"t2Pos\", ngrams.\"t3\", ngrams.\"t3Pos\", ngrams.\"t4\", ngrams.\"t4Pos\", ngrams.\"t5\", ngrams.\"t5Pos\", ngrams.\"t6\", ngrams.\"t6Pos\", ngrams.\"t7\", ngrams.\"t7Pos\", ngrams.\"t8\", ngrams.\"t8Pos\", ngrams.\"t9\", ngrams.\"t9Pos\", ngrams.\"t10\", ngrams.\"t10Pos\") as ngram, concat_ws(' ', examples.\"t1\", examples.\"t2\", examples.\"t3\", examples.\"t4\", examples.\"t5\", examples.\"t6\", examples.\"t7\", examples.\"t8\", examples.\"t9\", examples.\"t10\") as example, examples.frequency from ngrams.ngrams left join ngrams.examples on ngrams.\"idNgram\" = examples.\"idNgram\" and examples.id = (select examples.id from ngrams.examples where ngrams.\"idNgram\" = examples.\"idNgram\" and examples.\"idCorpus\" like '",corpusname,"%' order by examples.frequency desc limit 1) where ngrams.\"idCorpus\" like '",corpusname,"%' and ngrams.\"freqC1\" > ",minfreq,"") myTable <- dbGetQuery(con, query) # close the connection dbDisconnect(con) dbUnloadDriver(drv) ### 3d-Plot zur Analyse der n-Gramme in den Geschichten ## plotly library(plotly) library(htmlwidgets) ##Optional myTable2 <- myTable[1:50,] myTableOrig <- myTable myTable <- myTableOrig myTable <- myTable2 ## ggf. Subset wählen: myTable <- subset(myTable, nrOfInstances >= 70) myTable <- subset(myTable, nrOfInstances >= 200) ## Jetzt Darstellung: size <- nrow(myTable) myTable$nrOfInstancesLog <- log(myTable$nrOfInstances) p <- plot_ly(myTable, x = ~textpos_mean, y = ~clusterOrder, z = ~nrOfInstancesLog ) %>% add_markers(color= ~clusterGroup, opacity = .3, hoverinfo = 'text', text = ~paste( "n-Gram: ", ngram, "
", "häufigstes Exemplar: ", example, "", " (1/", nrOfInstances, ")
", "Durchschnittliche Position: ", textpos_mean, "
", "Standardabweichung: ", textpos_stddev, "
", "Frequenz: ", nrOfInstances, " (", log(nrOfInstances), ")
", "Cluster-Rang: ", clusterOrder, "" ), showlegend = F) %>% add_text(text = ~example, mode="markers+text", color = ~clusterGroup, hoverinfo = 'text' ) %>% onRender(" function(el, x) { el.on('plotly_click', function(d) { // d.points is an array of objects which, in this case, // is length 1 since the click is tied to 1 point. var pt = d.points[0]; var url = pt.data.text[pt.pointNumber]; console.log(url) }); } ")%>% layout(title = paste0("",titleofplot,": typische n-Gramme (Top ",size,")
Noah Bubenhofer, Universität Zürich"), scene = list(aspectmode = 'manual', aspectratio = list(x = '25', y = '20', z = '10'), xaxis = list(title = "Position im Text", gridcolor = 'rgb(255, 255, 255)', zerolinewidth = 1, ticklen = 5, gridwith = 2 ), yaxis = list( title = "Ähnlichkeit", gridcolor = 'rgb(255, 255, 255)', zerolinewidth = 1, ticklen = 5, gridwith = 2 ), zaxis = list( title = "Frequenz (log)", gridcolor = 'rgb(255, 255, 255)', zerolinewidth = 1, ticklen = 5, gridwith = 2 ) ), paper_bgcolor = 'rgb(243, 243, 243)', plot_bgcolor = 'rgb(243, 243, 243)', showlegend = FALSE ) p ### Zur Korrelation von Position und Standardabweichung (2d-Plot) # Anfang und Ende der Geschichte ist fixer als der Rest plot_ly(myTable, x= ~textpos_mean, y=~textpos_stddev) plot_ly(myTable, x= ~textpos_mean, y=~clusterOrder) # Korrelation Position und Stddev: Anfang und Ende der Geschichte ist fixer als der Rest, ggplot und plotly library(ggplot2) library("ggrepel") # simple Version qplot(textpos_mean, textpos_stddev, data=myTable, alpha=0.5, color=factor(senti1), size=frequency) # ein bisschen Sentimentanalyse: myTable$senti1 <- "[unbestimmt]"; # verschiedene Markierungen myTable$senti1[grep(paste(skandal$X1, collapse = "|"), myTable$ngram)] <- "Skandal"; myTable$senti1[grep(paste(intensivierer$X1, collapse = "|"), myTable$ngram)] <- "Intensivierer"; myTable$senti1[grep(paste(fnord$X1, collapse = "|"), myTable$ngram)] <- "fnord"; myTable$senti1[grep(paste(verschwoerung$X1, collapse = "|"), myTable$ngram)] <- "Verschwörung"; myTable$senti1[grep("Glück", myTable$ngram)] <- "Glück"; myTable$senti1[grep("Schmerz", myTable$ngram)] <- "Schmerz"; # Emotionswortschatz myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Traurigkeit",]$Term, collapse = "|"), myTable$ngram)] <- "Traurigkeit"; myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Freude",]$Term, collapse = "|"), myTable$ngram)] <- "Freude"; myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Ärger",]$Term, collapse = "|"), myTable$ngram)] <- "Ärger"; myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Ekel",]$Term, collapse = "|"), myTable$ngram)] <- "Ekel"; myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Erwartung",]$Term, collapse = "|"), myTable$ngram)] <- "Erwartung"; myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Furcht",]$Term, collapse = "|"), myTable$ngram)] <- "Furcht"; myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Überraschung",]$Term, collapse = "|"), myTable$ngram)] <- "Überraschung"; myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Vertrauen",]$Term, collapse = "|"), myTable$ngram)] <- "Vertrauen"; size <- nrow(myTable) # Hier nun der eigentliche Plot p <- ggplot(myTable, aes(x = textpos_mean, y = textpos_stddev, alpha=.6)) + geom_point(aes(color = senti1, size = frequency, name=example)) + #scale_color_brewer(palette="Set3") + scale_color_manual(values=c('#737373','#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6')) + labs(color = "Sentiments", size = "Frequenz", caption=paste0("Daten: Geburtsgeschichten aus Online-Foren, die häufigsten ", size, " n-Gramme")) + geom_smooth(show.legend = F, color = '#000000') + # für plotly nicht geeignet #geom_smooth(show.legend = F) + scale_alpha_continuous(guide = F) + labs(x="Positionen in der Geschichte", y="Standardabweichung der Position", title="Geburtsgeschichten: Typische Positionen der n-Gramme", subtitle="Korrelation von durchschnittlicher Position und Standardabweichung", fill = "Test") + theme(legend.position="none")+ # Hier Legende komplett ein- oder ausschalten geom_text_repel(segment.alpha = .5, arrow = arrow(length = unit(0.005, "npc")), show.legend = F, aes(label=ifelse( ( ( (textpos_stddev<0.15 & textpos_mean < 0.25) | (textpos_stddev<0.1 & textpos_mean > 0.75) | (textpos_stddev>0.37) ) & frequency > 30 ) ,example,''), size=12)) #+ geom_text(aes(label=ifelse(textpos_stddev<0.15,example,ifelse(textpos_stddev>0.35,example,'')), size=8)) #+ geom_text_repel(aes(label=ifelse(senti1 != '[verschiedene]',example,''), size=8)) #+ geom_text_repel(aes(label=ifelse(frequency > 200,example,''), size=8)) p <- ggplotly(p) # wenn interaktives plotly gewünscht, dann noch das hier auslösen, ansonsten ohne das. p