Skip to content

Commit

Permalink
september update
Browse files Browse the repository at this point in the history
  • Loading branch information
di-danilenko committed Sep 25, 2023
1 parent 6b4ae09 commit 3a3b30f
Show file tree
Hide file tree
Showing 80 changed files with 89,981 additions and 2,319 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file added Screenshot 2023-08-22 at 11.21.33.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Screenshot 2023-08-22 at 11.22.35.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Screenshot 2023-08-24 at 17.01.29.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Screenshot 2023-09-07 at 10.46.28.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Screenshot 2023-09-08 at 13.45.12.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Screenshot 2023-09-08 at 13.51.05.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Screenshot 2023-09-11 at 11.22.32.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Screenshot 2023-09-18 at 14.08.42.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added Screenshot 2023-09-18 at 14.31.20.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified code/.DS_Store
Binary file not shown.
238 changes: 238 additions & 0 deletions code/.Rhistory
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
# Run the script stored at this home-relative path before anything else.
# NOTE(review): the path points into one user's Desktop folder, so it is not
# portable to other machines - confirm whether it is still needed.
source("~/Desktop/gender_climate_justice/code/200423_stm.R")
# Set the knitr chunk option `echo` to TRUE (only relevant when knitting).
knitr::opts_chunk$set(echo = TRUE)
library("stm")
library("quanteda")
library("stminsights")
library("tidyverse")
library("readxl")
library("dplyr")
library("ggplot2")
library("tidytext")
library("gutenbergr")
library("reshape2")
library("gridExtra")
library("forestplot")
library("wesanderson")
library("tidyr")
### DATA LOADING ###
# Read the full dataset; the path is relative to the working directory.
data <- read.csv("data/df_full.csv", header = TRUE)

# Recode missing impact values (NA, empty string, or the literal "None") as
# "unknown", then drop the impact_quartile column.
# The recode is done inside mutate() so that `impact` resolves to the column:
# a bare `impact` outside the data mask is not defined and fails with
# "object 'impact' not found". One pass is enough - re-running the recode on
# already-cleaned data changes nothing.
# Inspect the result interactively with View(data) if needed.
data <- data %>%
  mutate(
    impact = ifelse(
      is.na(impact) | impact == "" | impact == "None",
      "unknown",
      impact
    )
  ) %>%
  select(-impact_quartile)
### DATA PROCESSING ###
# building a corpus using quanteda

# combine title, abstract and keywords in a column called text
data$text <- paste(data$title, data$abstract, data$keywords)

# get rid of the copyright information in our corpus:
# split each text at the first "(c)" followed by digits; everything after the
# match goes to the `copyright` column (extra = "merge" keeps it in one piece).
# Rows without a match get NA in `copyright` (tidyr warns about this).
data <- data %>%
  separate(
    text,
    c("text", "copyright"),
    sep = "\\(c\\)\\s*\\d+",
    extra = "merge",
    remove = TRUE
  )

# Remove any remaining "all rights reserved" boilerplate.
# NOTE(review): gsub() is case-sensitive here, so "All rights reserved" is
# left untouched - confirm whether ignore.case = TRUE is wanted.
data$text <- gsub("all rights reserved", " ", as.character(data$text))

# Drop the X column so it does not become a document variable.
# NOTE(review): presumably the row-index column read.csv() creates from an
# unnamed first CSV column - confirm against the input file.
data <- data %>%
  select(-X)

# create the actual CORPUS
# the function automatically assumes all other columns contain document
# variables
corp <- corpus(data, text_field = "text")
summary(corp, 3)
head(docvars(corp))
# create TOKENS from the corpus - this basically just means cutting up the
# text into individual words - and apply a fairly standard pre-processing list
toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>%
  # lowercase everything
  tokens_tolower() %>%
  # remove English stopwords
  # NOTE(review): the original comment says min_nchar = 2 removes short words;
  # confirm how min_nchar behaves inside tokens_remove() (it is forwarded to
  # tokens_select() with selection = "remove").
  tokens_remove(pattern = stopwords("en"), min_nchar = 2) %>%
  # stem with the Snowball stemmer
  tokens_wordstem(language = "en")

# create two dfms: one with unigrams and one with bi-grams
dfm_single <- dfm(toks) %>%
  dfm_trim(
    min_termfreq = 100, termfreq_type = "count",
    max_docfreq = 0.95, docfreq_type = "prop"
  )

# NOTE(review): skip = 1:2 builds pairs of tokens that are one or two positions
# apart; adjacent pairs would need skip = 0 - confirm this is intended.
toks_bigram <- tokens_ngrams(toks, n = 2, skip = 1:2)

# The bigram dfm must exist before it is combined below. The threshold of 100
# is the last value tried in the session (after 500 and 250).
dfm_bigram <- dfm(toks_bigram) %>%
  dfm_trim(
    min_termfreq = 100, termfreq_type = "count",
    max_docfreq = 0.95, docfreq_type = "prop"
  )

# combine; the result keeps the name `dfm` because later steps refer to it
dfm <- cbind(dfm_bigram, dfm_single)

# show most-frequent tokens:
topfeatures(dfm, 15) # should include some bi-grams now
# Fit candidate topic models for three values of K with manyTopics().
# The earlier stm() call with K = c(75, 100, 125) is dropped: stm() fits a
# single K, and its result was overwritten by this call anyway.
storage <- manyTopics(
  dfm,
  seed = 1608,
  K = c(75, 100, 125),
  runs = 12,
  prevalence = ~ first_author_female + last_author_female +
    majority_female_binary + s(year) + subfield + X1_gii_quartile +
    X2_gii_quartile + impact,
  data = docvars(dfm),
  max.em.its = 100,
  control = list(
    # Lower than default of 50/k to allow documents to have more topics ->
    # makes small topics more likely to show up
    alpha = 0.5,
    # Higher than default of 0.01 to create topics composed of more words ->
    # more clarity for complex topics
    eta = 0.1
  )
)

# One selected model per K, in the order the K values were given.
# Each model is assigned before it is saved.
model1 <- storage$out[[1]]
model2 <- storage$out[[2]]
model3 <- storage$out[[3]]

# You can export the whole storage, but you may want to just export the model
# you like best.
# NOTE(review): model1 was also written to "models/170823_1.Rds" in the
# session; only the working-directory copies are kept here so all three files
# sit side by side - confirm the intended location.
saveRDS(model1, file = "170823_1.Rds")
saveRDS(model2, file = "170823_2.Rds")
saveRDS(model3, file = "170823_3.Rds")
# Build a one-column data frame (`vocab`) holding, for every topic in `model`,
# its `n` highest-probability words joined by single spaces.
#
# model: a fitted STM object (reads model$settings$dim$K for the topic count).
# n:     number of top words per topic (default 10).
# Returns a data.frame with one row per topic and a character column `vocab`.
top_words_table <- function(model, n = 10) {
  n_topics <- model$settings$dim$K
  top_words <- labelTopics(model, seq_len(n_topics), n = n)$prob
  data.frame(
    vocab = apply(top_words, 1, paste, collapse = " "),
    stringsAsFactors = FALSE
  )
}

# explore the outputs
# see the topic frequency with the top n keywords:
plot.STM(model1, type = "summary", n = 5, xlim = c(0, .12))
plot.STM(model2, type = "summary", n = 5, xlim = c(0, .12))
plot.STM(model3, type = "summary", n = 5, xlim = c(0, .12))

# print out topics with keywords
# (calls that used names not defined in this session log -
# climatejusticetopics3, model1_90, topics1 - are dropped, as is the
# unterminated string literal that made the block unparseable)
topics <- top_words_table(model1)
topics2 <- top_words_table(model2)
topics3 <- top_words_table(model3)

write.csv(topics, "topics1.csv")
write.csv(topics2, "topics2.csv")
write.csv(topics3, "topics3.csv")
# Compare candidate numbers of topics with searchK().
#
# searchK()'s leading formals are documents, vocab, K. The earlier attempts
# either omitted vocab, passed `data` twice, left `replace = ` empty, or gave K
# as the second positional argument, where it is matched to `vocab` and leaves
# K itself missing. Converting the dfm to stm's native input and naming every
# argument avoids all of these.
set.seed(210823)
K <- c(110, 120, 130, 140, 150)

# convert() returns a list with documents, vocab and meta; meta holds the
# document variables for the documents that were kept.
stm_input <- quanteda::convert(dfm, to = "stm")

kresult <- searchK(
  documents = stm_input$documents,
  vocab = stm_input$vocab,
  K = K,
  prevalence = ~ first_author_female + last_author_female +
    majority_female_binary + s(year) + subfield + X1_gii_quartile +
    X2_gii_quartile + impact,
  data = stm_input$meta
)
Loading

0 comments on commit 3a3b30f

Please sign in to comment.