september update

di-danilenko · Sep 25, 2023 · 3a3b30f · 3a3b30f
1 parent 6b4ae09
commit 3a3b30f
Show file tree

Hide file tree

Showing 80 changed files with 89,981 additions and 2,319 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/Screenshot 2023-08-22 at 11.21.33.png b/Screenshot 2023-08-22 at 11.21.33.png
diff --git a/Screenshot 2023-08-22 at 11.22.35.png b/Screenshot 2023-08-22 at 11.22.35.png
diff --git a/Screenshot 2023-08-24 at 17.01.29.png b/Screenshot 2023-08-24 at 17.01.29.png
diff --git a/Screenshot 2023-09-07 at 10.46.28.png b/Screenshot 2023-09-07 at 10.46.28.png
diff --git a/Screenshot 2023-09-08 at 13.45.12.png b/Screenshot 2023-09-08 at 13.45.12.png
diff --git a/Screenshot 2023-09-08 at 13.51.05.png b/Screenshot 2023-09-08 at 13.51.05.png
diff --git a/Screenshot 2023-09-11 at 11.22.32.png b/Screenshot 2023-09-11 at 11.22.32.png
diff --git a/Screenshot 2023-09-18 at 14.08.42.png b/Screenshot 2023-09-18 at 14.08.42.png
diff --git a/Screenshot 2023-09-18 at 14.31.20.png b/Screenshot 2023-09-18 at 14.31.20.png
diff --git a/code/.DS_Store b/code/.DS_Store
diff --git a/code/.Rhistory b/code/.Rhistory
@@ -0,0 +1,238 @@
+source("~/Desktop/gender_climate_justice/code/200423_stm.R")
+knitr::opts_chunk$set(echo = TRUE)
+library("stm")
+library("quanteda")
+library("stminsights")
+library("tidyverse")
+library("readxl")
+library("dplyr")
+library("ggplot2")
+library("tidytext")
+library("gutenbergr")
+library("reshape2")
+library("gridExtra")
+library("forestplot")
+library("wesanderson")
+library("tidyr")
+data <- read.csv('data/df_full.csv', header = TRUE)
+View(data)
+data <- data %>%
+mutate(impact = ifelse(is.na(impact) | impact == "" | impact == "None", "unknown", impact))
+View(data)
+data$impact <- ifelse((is.na(impact) | impact == "" | impact == "None"), "unknown", impact)
+data$impact <- ifelse((is.na(data$impact) | data$impact == "" | data$impact == "None"), "unknown", impact)
+data$impact <- ifelse((is.na(data$impact) | data$impact == "" | data$impact == "None"), "unknown", data$impact)
+data$impact <- ifelse(is.na(data$impact), "unknown", data$impact)
+data <- data %>%
+select(-impact_quartile)
+data$text <- "text"
+### DATA PROCESSING ###
+# building a corpus using quanteda
+# combine title, abstract and keywords in a column called text
+data$text <- paste(data$title, data$abstract, data$keywords)
+data$text <- gsub("all rights reserved"," ",as.character(data$text))
+# get rid of the copyright information in our corpus
+data <- data %>% separate(text, c("text","copyright"), sep = "\\(c\\)\\s*\\d+", extra="merge", remove = TRUE) #removes (c) followed by numbers - this covers most copyright messages
+data$text <- gsub("all rights reserved"," ",as.character(data$text))
+# create the actual CORPUS
+corp <- corpus(data, text_field = "text")
+summary(corp, 3)
+data <- data %>%
+select(-X)
+# create the actual CORPUS
+corp <- corpus(data, text_field = "text")
+summary(corp, 3)
+# building a corpus using quanteda
+# combine title, abstract and keywords in a column called text
+data$text <- paste(data$title, data$abstract, data$keywords)
+# get rid of the copyright information in our corpus
+data <- data %>% separate(text, c("text","copyright"), sep = "\\(c\\)\\s*\\d+", extra="merge", remove = TRUE) #removes (c) followed by numbers - this covers most copyright messages
+data$text <- gsub("all rights reserved"," ",as.character(data$text))
+# create the actual CORPUS
+corp <- corpus(data, text_field = "text")
+summary(corp, 3)
+# the function automatically assumes all other columns contain document variables
+head(docvars(corp))
+# create TOKENS from this - this basically just means cutting up the text into individual words
+toks <- tokens(corp)
+#the below is a fairly standard list for pre-processing
+toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% #create tokens w/o punctuation or symbols
+tokens_tolower() %>% #lowercase -- note the pipe means this function doesn't need any input
+tokens_remove(pattern = stopwords("en"), #remove stopwords
+min_nchar = 2) %>% #remove short words. AMR is three letters, so best keep it to 2
+tokens_wordstem(language = "en") #Using snowball stemmer
+# create two dfms: one with unigrams and one with bi-grams
+dfm_single = dfm(toks)%>%
+dfm_trim(min_termfreq = 100, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop")
+toks_bigram <- tokens_ngrams(toks, n = 2, skip = 1:2)
+# combine
+dfm = cbind(dfm_bigram, dfm_single)
+dfm_bigram <- dfm(toks_bigram) %>%
+dfm_trim(min_termfreq = 500, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop")
+dfm_bigram <- dfm(toks_bigram) %>%
+dfm_trim(min_termfreq = 500, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop")
+# combine
+dfm = cbind(dfm_bigram, dfm_single)
+# combine
+dfm = cbind(dfm_bigram, dfm_single)
+``
+# combine
+dfm = cbind(dfm_bigram, dfm_single)
+# show most-frequent tokens:
+topfeatures(dfm, 15) # should include some bi-grams now
+toks_bigram <- tokens_ngrams(toks, n = 2, skip = 1:2)
+dfm_bigram <- dfm(toks_bigram) %>%
+dfm_trim(min_termfreq = 250, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop")
+# combine
+dfm = cbind(dfm_bigram, dfm_single)
+# show most-frequent tokens:
+topfeatures(dfm, 15) # should include some bi-grams now
+toks_bigram <- tokens_ngrams(toks, n = 2, skip = 1:2)
+dfm_bigram <- dfm(toks_bigram) %>%
+dfm_trim(min_termfreq = 100, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop")
+# combine
+dfm = cbind(dfm_bigram, dfm_single)
+# show most-frequent tokens:
+topfeatures(dfm, 15) # should include some bi-grams now
+View(dfm)
+storage <- stm(dfm, K=c(75,100,125), seed=53,
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+max.em.its = 100,
+control = list(alpha = 0.5, #Lower than default of 50/k to allow documents to have more topics -> makes small topics more likely to show up
+eta = 0.1)) #Higher than default of 0.01 to create topics composed of more words -> more clarity for complex topics
+storage <- manyTopics(dfm, seed=1608,
+K=c(75,100,125),
+runs = 12,
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm),
+max.em.its = 100,
+control = list(alpha = 0.5, #Lower than default of 50/k to allow documents to have more topics -> makes small topics more likely to show up
+eta = 0.1)) #Higher than default of 0.01 to create topics composed of more words -> more clarity for complex topics
+model1 <- storage$out[[1]]
+#You can export the whole storage, but you may want to just export the model you like best
+saveRDS(model1, file = "models/170823_1.Rds")
+#You can export the whole storage, but you may want to just export the model you like best
+saveRDS(model1, file = "170823_1.Rds")
+saveRDS(model2, file = "170823_2.Rds")
+model2 <- storage$out[[2]]
+saveRDS(model2, file = "170823_2.Rds")
+model3 <- storage$out[[3]]
+saveRDS(model3, file = "170823_3.Rds")
+plot.STM(model3, type="summary", n = 5, xlim=c(0,.12), topics = climatejusticetopics3)
+plot.STM(model3, type="summary", n = 5, xlim=c(0,.12))
+plot.STM(model3, type="summary", n = 5, xlim=c(0,.12))
+plot.STM(model2, type="summary", n = 5, xlim=c(0,.12))
+# explore the outputs
+# see the topic frequency with the top n keywords:
+plot.STM(model1, type="summary", n = 5, xlim=c(0,.12))
+# print out topics with keywords
+topics <- data.frame(labelTopics(model1, c(1:model1_90$settings$dim$K), n=10)$prob)
+# print out topics with keywords
+topics <- data.frame(labelTopics(model1, c(1:model1$settings$dim$K), n=10)$prob)
+# print out topics with keywords
+topics <- data.frame(labelTopics(model1, c(1:model1$settings$dim$K), n=10)$prob)
+# print out topics with keywords
+topics <- data.frame(labelTopics(model1, c(1:model1$settings$dim$K), n=10)$prob)
+topics1$vocab <- "text
+topics$vocab <- "text"
+topics$vocab <- paste(topics$X1,topics$X2,topics$X3,topics$X4,topics$X5,topics$X6,topics$X7, topics$X8, topics$X9, topics$X10)
+topics$vocab <- "text"
+topics$vocab <- paste(topics$X1,topics$X2,topics$X3,topics$X4,topics$X5,topics$X6,topics$X7, topics$X8, topics$X9, topics$X10)
+topics = subset(topics, select = -c(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10))
+View(topics)
+topics2 <- data.frame(labelTopics(model2, c(1:model2$settings$dim$K), n=10)$prob)
+topics2$vocab <- "text"
+topics2$vocab <- paste(topics2$X1,topics2$X2,topics2$X3,topics2$X4,topics2$X5,topics2$X6,topics2$X7, topics2$X8, topics2$X9, topics2$X10)
+topics2 = subset(topics2, select = -c(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10))
+topics3 <- data.frame(labelTopics(model3, c(1:model3$settings$dim$K), n=10)$prob)
+topics3$vocab <- "text"
+topics3$vocab <- paste(topics3$X1,topics3$X2,topics3$X3,topics3$X4,topics3$X5,topics3$X6,topics3$X7, topics3$X8, topics3$X9, topics3$X10)
+topics3 = subset(topics3, select = -c(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10))
+write.csv(topics,"topics1.csv")
+write.csv(topics2,"topics2.csv")
+write.csv(topics3,"topics3.csv")
+kresult <- searchK(dfm,
+K=c(100,110,120,130,140,150),
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm),
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm),
+max.em.its = 50)
+View(dfm)
+kresult <- searchK(dfm,
+K=c(100,110,120,130,140,150),
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm),
+max.em.its = 50)
+kresult <- searchK(dfm,
+K=c(100,110,120,130,140,150),
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm),
+max.em.its = 50)
+kresult <- searchK(dfm, seed = 2108,
+K=c(100,110,120,130,140,150),
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm),
+max.em.its = 50)
+View(data)
+View(dfm)
+kresult <- searchK(dfm,
+seed = 2108,
+K=c(100,110,120,130,140,150),
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm))
+kresult <- searchK(vocab = dfm,
+seed = 2108,
+K=c(100,110,120,130,140,150),
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm))
+kresult <- searchK(documents = dfm,
+vocab = dfm,
+seed = 2108,
+K=c(100,110,120,130,140,150),
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm))
+kresult <- searchK(dfm,
+seed = 2108,
+K=c(110,120,130,140,150),
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm))
+set.seed(210823)
+K <- c(110,120,130,140,150)
+kresult <- searchK(dfm,
+K,
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm), replace = )
+kresult <- searchK(dfm,
+K,
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm))
+kresult <- searchK(dfm, K,
+prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact,
+data = docvars(dfm))
+kresult <- searchK(dfm, K,
+prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact)
+kresult <- searchK(dfm, K,
+prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact)
+kresult <- searchK(documents = dfm, K,
+prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, data = docvars(dfm))
+K <- c(110,120,130,140,150)
+kresult <- searchK(documents = dfm, K,
+prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, data = docvars(dfm))
+set.seed(210823)
+K <- c(110,120,130,140,150)
+kresult <- searchK(documents = dfm, K,
+prevalence =~first_author_female + last_author_female + majority_female_binary +
+s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, data = docvars(dfm))