# Verschiedene Plots zu den Geburtsgeschichten und den typischen n-Grammen mit Positionsangaben in den Geschichten
# Zuerst ein 3d-Plot zur Analyse von n-Grammen, dann ein 2d-Plot zur Korrelation von Position und Standardabweichung
# Notwendig ist ein Datenbankzugriff auf die Daten. Dazu bitte Noah Bubenhofer kontaktieren!
# Noah Bubenhofer: Serialität der Singularität. Korpusanalyse narrativer Muster in Geburtsberichten
# http://www.bubenhofer.com/publikationen/geburtsberichte/
### Versuche mit 3d-Plots
## Datenbankanbindung
# DB
# install.packages("RPostgreSQL")
require("RPostgreSQL")
# create a connection
# save the password that we can "hide" it as best as we can by collapsing it
pw <- {
""
}
# loads the PostgreSQL driver
drv <- dbDriver("PostgreSQL")
# creates a connection to the postgres database
# note that "con" will be used later in each connection to the database
con <- dbConnect(drv, dbname = "visling",
host = "", port = ,
user = "", password = pw)
rm(pw) # removes the password
# check for the cartable
dbExistsTable(con, c("ngrams","ngrams"))
# TRUE
corpusname <- "geburtsberichtelight_ngrams_lemmas";
#minfreq <- 10;
minfreq <- 40;
#minfreq <- 6;
titleofplot <- "Geburtsberichte"
query <- paste0("select ngrams.id, ngrams.\"idNgram\", ngrams.\"freqC1\", ngrams.\"freqC2\",
ngrams.\"freqExpected\", ngrams.\"pValue\", ngrams.\"clusterOrder\", ngrams.\"clusterGroup\",
ngrams.\"idCorpus\", ngrams.\"nrOfInstances\", ngrams.\"nrOfUniqueInstances\",
ngrams.textpos_mean, ngrams.textpos_stddev,
concat_ws(' ', ngrams.\"t1\", ngrams.\"t1Pos\", ngrams.\"t2\", ngrams.\"t2Pos\", ngrams.\"t3\",
ngrams.\"t3Pos\", ngrams.\"t4\", ngrams.\"t4Pos\", ngrams.\"t5\", ngrams.\"t5Pos\", ngrams.\"t6\",
ngrams.\"t6Pos\", ngrams.\"t7\", ngrams.\"t7Pos\", ngrams.\"t8\", ngrams.\"t8Pos\", ngrams.\"t9\",
ngrams.\"t9Pos\", ngrams.\"t10\", ngrams.\"t10Pos\") as ngram,
concat_ws(' ', examples.\"t1\", examples.\"t2\", examples.\"t3\", examples.\"t4\",
examples.\"t5\", examples.\"t6\", examples.\"t7\", examples.\"t8\", examples.\"t9\",
examples.\"t10\") as example,
examples.frequency from ngrams.ngrams
left join ngrams.examples on ngrams.\"idNgram\" = examples.\"idNgram\"
and examples.id = (select examples.id from ngrams.examples
where ngrams.\"idNgram\" = examples.\"idNgram\"
and examples.\"idCorpus\" like '",corpusname,"%'
order by examples.frequency desc limit 1)
where ngrams.\"idCorpus\" like '",corpusname,"%'
and ngrams.\"freqC1\" > ",minfreq,"")
myTable <- dbGetQuery(con, query)
# close the connection
dbDisconnect(con)
dbUnloadDriver(drv)
### 3d-Plot zur Analyse der n-Gramme in den Geschichten
## plotly
library(plotly)
library(htmlwidgets)
##Optional
myTable2 <- myTable[1:50,]
myTableOrig <- myTable
myTable <- myTableOrig
myTable <- myTable2
## ggf. Subset wählen:
myTable <- subset(myTable, nrOfInstances >= 70)
myTable <- subset(myTable, nrOfInstances >= 200)
## Jetzt Darstellung:
size <- nrow(myTable)
myTable$nrOfInstancesLog <- log(myTable$nrOfInstances)
p <- plot_ly(myTable, x = ~textpos_mean, y = ~clusterOrder, z = ~nrOfInstancesLog
) %>%
add_markers(color= ~clusterGroup, opacity = .3,
hoverinfo = 'text',
text = ~paste(
"n-Gram: ", ngram, "
",
"häufigstes Exemplar: ", example, "",
" (1/", nrOfInstances, ")
",
"Durchschnittliche Position: ", textpos_mean, "
",
"Standardabweichung: ", textpos_stddev, "
",
"Frequenz: ", nrOfInstances, " (", log(nrOfInstances), ")
",
"Cluster-Rang: ", clusterOrder, ""
),
showlegend = F) %>%
add_text(text = ~example, mode="markers+text", color = ~clusterGroup,
hoverinfo = 'text'
) %>%
onRender("
function(el, x) {
el.on('plotly_click', function(d) {
// d.points is an array of objects which, in this case,
// is length 1 since the click is tied to 1 point.
var pt = d.points[0];
var url = pt.data.text[pt.pointNumber];
console.log(url)
});
}
")%>%
layout(title = paste0("",titleofplot,": typische n-Gramme (Top ",size,")
Noah Bubenhofer, Universität Zürich"),
scene = list(aspectmode = 'manual', aspectratio = list(x = '25', y = '20', z = '10'),
xaxis = list(title = "Position im Text",
gridcolor = 'rgb(255, 255, 255)',
zerolinewidth = 1,
ticklen = 5,
gridwith = 2
),
yaxis = list(
title = "Ähnlichkeit",
gridcolor = 'rgb(255, 255, 255)',
zerolinewidth = 1,
ticklen = 5,
gridwith = 2
),
zaxis = list(
title = "Frequenz (log)",
gridcolor = 'rgb(255, 255, 255)',
zerolinewidth = 1,
ticklen = 5,
gridwith = 2
)
),
paper_bgcolor = 'rgb(243, 243, 243)',
plot_bgcolor = 'rgb(243, 243, 243)',
showlegend = FALSE
)
p
### Zur Korrelation von Position und Standardabweichung (2d-Plot)
# Anfang und Ende der Geschichte ist fixer als der Rest
plot_ly(myTable, x= ~textpos_mean, y=~textpos_stddev)
plot_ly(myTable, x= ~textpos_mean, y=~clusterOrder)
# Korrelation Position und Stddev: Anfang und Ende der Geschichte ist fixer als der Rest, ggplot und plotly
library(ggplot2)
library("ggrepel")
# simple Version
qplot(textpos_mean, textpos_stddev, data=myTable, alpha=0.5, color=factor(senti1), size=frequency)
# ein bisschen Sentimentanalyse:
myTable$senti1 <- "[unbestimmt]";
# verschiedene Markierungen
myTable$senti1[grep(paste(skandal$X1, collapse = "|"), myTable$ngram)] <- "Skandal";
myTable$senti1[grep(paste(intensivierer$X1, collapse = "|"), myTable$ngram)] <- "Intensivierer";
myTable$senti1[grep(paste(fnord$X1, collapse = "|"), myTable$ngram)] <- "fnord";
myTable$senti1[grep(paste(verschwoerung$X1, collapse = "|"), myTable$ngram)] <- "Verschwörung";
myTable$senti1[grep("Glück", myTable$ngram)] <- "Glück";
myTable$senti1[grep("Schmerz", myTable$ngram)] <- "Schmerz";
# Emotionswortschatz
myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Traurigkeit",]$Term, collapse = "|"), myTable$ngram)] <- "Traurigkeit";
myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Freude",]$Term, collapse = "|"), myTable$ngram)] <- "Freude";
myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Ärger",]$Term, collapse = "|"), myTable$ngram)] <- "Ärger";
myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Ekel",]$Term, collapse = "|"), myTable$ngram)] <- "Ekel";
myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Erwartung",]$Term, collapse = "|"), myTable$ngram)] <- "Erwartung";
myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Furcht",]$Term, collapse = "|"), myTable$ngram)] <- "Furcht";
myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Überraschung",]$Term, collapse = "|"), myTable$ngram)] <- "Überraschung";
myTable$senti1[grep(paste(emotionen[emotionen$Emotion == "Vertrauen",]$Term, collapse = "|"), myTable$ngram)] <- "Vertrauen";
size <- nrow(myTable)
# Hier nun der eigentliche Plot
p <- ggplot(myTable, aes(x = textpos_mean, y = textpos_stddev, alpha=.6)) +
geom_point(aes(color = senti1, size = frequency, name=example)) +
#scale_color_brewer(palette="Set3") +
scale_color_manual(values=c('#737373','#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6')) +
labs(color = "Sentiments", size = "Frequenz", caption=paste0("Daten: Geburtsgeschichten aus Online-Foren, die häufigsten ", size, " n-Gramme")) +
geom_smooth(show.legend = F, color = '#000000') + # für plotly nicht geeignet
#geom_smooth(show.legend = F) +
scale_alpha_continuous(guide = F) +
labs(x="Positionen in der Geschichte", y="Standardabweichung der Position",
title="Geburtsgeschichten: Typische Positionen der n-Gramme",
subtitle="Korrelation von durchschnittlicher Position und Standardabweichung",
fill = "Test") +
theme(legend.position="none")+ # Hier Legende komplett ein- oder ausschalten
geom_text_repel(segment.alpha = .5, arrow = arrow(length = unit(0.005, "npc")), show.legend = F,
aes(label=ifelse(
(
(
(textpos_stddev<0.15 & textpos_mean < 0.25) |
(textpos_stddev<0.1 & textpos_mean > 0.75) |
(textpos_stddev>0.37)
) & frequency > 30
)
,example,''), size=12))
#+ geom_text(aes(label=ifelse(textpos_stddev<0.15,example,ifelse(textpos_stddev>0.35,example,'')), size=8))
#+ geom_text_repel(aes(label=ifelse(senti1 != '[verschiedene]',example,''), size=8))
#+ geom_text_repel(aes(label=ifelse(frequency > 200,example,''), size=8))
p <- ggplotly(p) # wenn interaktives plotly gewünscht, dann noch das hier auslösen, ansonsten ohne das.
p