-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
6b4ae09
commit 3a3b30f
Showing
80 changed files
with
89,981 additions
and
2,319 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,238 @@ | ||
source("~/Desktop/gender_climate_justice/code/200423_stm.R") | ||
knitr::opts_chunk$set(echo = TRUE) | ||
library("stm") | ||
library("quanteda") | ||
library("stminsights") | ||
library("tidyverse") | ||
library("readxl") | ||
library("dplyr") | ||
library("ggplot2") | ||
library("tidytext") | ||
library("gutenbergr") | ||
library("reshape2") | ||
library("gridExtra") | ||
library("forestplot") | ||
library("wesanderson") | ||
library("tidyr") | ||
data <- read.csv('data/df_full.csv', header = TRUE) | ||
View(data) | ||
data <- data %>% | ||
mutate(impact = ifelse(is.na(impact) | impact == "" | impact == "None", "unknown", impact)) | ||
View(data) | ||
data$impact <- ifelse((is.na(impact) | impact == "" | impact == "None"), "unknown", impact) | ||
data$impact <- ifelse((is.na(data$impact) | data$impact == "" | data$impact == "None"), "unknown", impact) | ||
data$impact <- ifelse((is.na(data$impact) | data$impact == "" | data$impact == "None"), "unknown", data$impact) | ||
data$impact <- ifelse(is.na(data$impact), "unknown", data$impact) | ||
data <- data %>% | ||
select(-impact_quartile) | ||
data$text <- "text" | ||
### DATA PROCESSING ### | ||
# building a corpus using quanteda | ||
# combine title, abstract and keywords in a column called text | ||
data$text <- paste(data$title, data$abstract, data$keywords) | ||
data$text <- gsub("all rights reserved"," ",as.character(data$text)) | ||
# get rid of the copyright information in our corpus | ||
data <- data %>% separate(text, c("text","copyright"), sep = "\\(c\\)\\s*\\d+", extra="merge", remove = TRUE) #removes (c) followed by numbers - this covers most copyright messages | ||
data$text <- gsub("all rights reserved"," ",as.character(data$text)) | ||
# create the actual CORPUS | ||
corp <- corpus(data, text_field = "text") | ||
summary(corp, 3) | ||
data <- data %>% | ||
select(-X) | ||
# create the actual CORPUS | ||
corp <- corpus(data, text_field = "text") | ||
summary(corp, 3) | ||
# building a corpus using quanteda | ||
# combine title, abstract and keywords in a column called text | ||
data$text <- paste(data$title, data$abstract, data$keywords) | ||
# get rid of the copyright information in our corpus | ||
data <- data %>% separate(text, c("text","copyright"), sep = "\\(c\\)\\s*\\d+", extra="merge", remove = TRUE) #removes (c) followed by numbers - this covers most copyright messages | ||
data$text <- gsub("all rights reserved"," ",as.character(data$text)) | ||
# create the actual CORPUS | ||
corp <- corpus(data, text_field = "text") | ||
summary(corp, 3) | ||
# the function automatically assumes all other columns contain document variables | ||
head(docvars(corp)) | ||
# create TOKENS from this - this basically just means cutting up the text into individual words | ||
toks <- tokens(corp) | ||
#the below is a fairly standard list for pre-processing | ||
toks <- tokens(corp, remove_punct = TRUE, remove_symbols = TRUE) %>% #create tokens w/o punctuation or symbols | ||
tokens_tolower() %>% #lowercase -- note the pipe means this function doesn't need any input | ||
tokens_remove(pattern = stopwords("en"), #remove stopwords | ||
min_nchar = 2) %>% #remove short words. AMR is three letters, so best keep it to 2 | ||
tokens_wordstem(language = "en") #Using snowball stemmer | ||
# create two dfms: one with unigrams and one with bi-grams | ||
dfm_single = dfm(toks)%>% | ||
dfm_trim(min_termfreq = 100, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop") | ||
toks_bigram <- tokens_ngrams(toks, n = 2, skip = 1:2) | ||
# combine | ||
dfm = cbind(dfm_bigram, dfm_single) | ||
dfm_bigram <- dfm(toks_bigram) %>% | ||
dfm_trim(min_termfreq = 500, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop") | ||
dfm_bigram <- dfm(toks_bigram) %>% | ||
dfm_trim(min_termfreq = 500, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop") | ||
# combine | ||
dfm = cbind(dfm_bigram, dfm_single) | ||
# combine | ||
dfm = cbind(dfm_bigram, dfm_single) | ||
`` | ||
# combine | ||
dfm = cbind(dfm_bigram, dfm_single) | ||
# show most-frequent tokens: | ||
topfeatures(dfm, 15) # should include some bi-grams now | ||
toks_bigram <- tokens_ngrams(toks, n = 2, skip = 1:2) | ||
dfm_bigram <- dfm(toks_bigram) %>% | ||
dfm_trim(min_termfreq = 250, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop") | ||
# combine | ||
dfm = cbind(dfm_bigram, dfm_single) | ||
# show most-frequent tokens: | ||
topfeatures(dfm, 15) # should include some bi-grams now | ||
toks_bigram <- tokens_ngrams(toks, n = 2, skip = 1:2) | ||
dfm_bigram <- dfm(toks_bigram) %>% | ||
dfm_trim(min_termfreq = 100, termfreq_type = "count", max_docfreq=0.95, docfreq_type = "prop") | ||
# combine | ||
dfm = cbind(dfm_bigram, dfm_single) | ||
# show most-frequent tokens: | ||
topfeatures(dfm, 15) # should include some bi-grams now | ||
View(dfm) | ||
storage <- stm(dfm, K=c(75,100,125), seed=53, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
max.em.its = 100, | ||
control = list(alpha = 0.5, #Lower than default of 50/k to allow documents to have more topics -> makes small topics more likely to show up | ||
eta = 0.1)) #Higher than default of 0.01 to create topics composed of more words -> more clarity for complex topics | ||
storage <- manyTopics(dfm, seed=1608, | ||
K=c(75,100,125), | ||
runs = 12, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm), | ||
max.em.its = 100, | ||
control = list(alpha = 0.5, #Lower than default of 50/k to allow documents to have more topics -> makes small topics more likely to show up | ||
eta = 0.1)) #Higher than default of 0.01 to create topics composed of more words -> more clarity for complex topics | ||
model1 <- storage$out[[1]] | ||
#You can export the whole storage, but you may want to just export the model you like best | ||
saveRDS(model1, file = "models/170823_1.Rds") | ||
#You can export the whole storage, but you may want to just export the model you like best | ||
saveRDS(model1, file = "170823_1.Rds") | ||
saveRDS(model2, file = "170823_2.Rds") | ||
model2 <- storage$out[[2]] | ||
saveRDS(model2, file = "170823_2.Rds") | ||
model3 <- storage$out[[3]] | ||
saveRDS(model3, file = "170823_3.Rds") | ||
plot.STM(model3, type="summary", n = 5, xlim=c(0,.12), topics = climatejusticetopics3) | ||
plot.STM(model3, type="summary", n = 5, xlim=c(0,.12)) | ||
plot.STM(model3, type="summary", n = 5, xlim=c(0,.12)) | ||
plot.STM(model2, type="summary", n = 5, xlim=c(0,.12)) | ||
# explore the outputs | ||
# see the topic frequency with the top n keywords: | ||
plot.STM(model1, type="summary", n = 5, xlim=c(0,.12)) | ||
# print out topics with keywords | ||
topics <- data.frame(labelTopics(model1, c(1:model1_90$settings$dim$K), n=10)$prob) | ||
# print out topics with keywords | ||
topics <- data.frame(labelTopics(model1, c(1:model1$settings$dim$K), n=10)$prob) | ||
# print out topics with keywords | ||
topics <- data.frame(labelTopics(model1, c(1:model1$settings$dim$K), n=10)$prob) | ||
# print out topics with keywords | ||
topics <- data.frame(labelTopics(model1, c(1:model1$settings$dim$K), n=10)$prob) | ||
topics1$vocab <- "text | ||
topics$vocab <- "text" | ||
topics$vocab <- paste(topics$X1,topics$X2,topics$X3,topics$X4,topics$X5,topics$X6,topics$X7, topics$X8, topics$X9, topics$X10) | ||
topics$vocab <- "text" | ||
topics$vocab <- paste(topics$X1,topics$X2,topics$X3,topics$X4,topics$X5,topics$X6,topics$X7, topics$X8, topics$X9, topics$X10) | ||
topics = subset(topics, select = -c(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10)) | ||
View(topics) | ||
topics2 <- data.frame(labelTopics(model2, c(1:model2$settings$dim$K), n=10)$prob) | ||
topics2$vocab <- "text" | ||
topics2$vocab <- paste(topics2$X1,topics2$X2,topics2$X3,topics2$X4,topics2$X5,topics2$X6,topics2$X7, topics2$X8, topics2$X9, topics2$X10) | ||
topics2 = subset(topics2, select = -c(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10)) | ||
topics3 <- data.frame(labelTopics(model3, c(1:model3$settings$dim$K), n=10)$prob) | ||
topics3$vocab <- "text" | ||
topics3$vocab <- paste(topics3$X1,topics3$X2,topics3$X3,topics3$X4,topics3$X5,topics3$X6,topics3$X7, topics3$X8, topics3$X9, topics3$X10) | ||
topics3 = subset(topics3, select = -c(X1, X2, X3, X4, X5, X6, X7, X8, X9, X10)) | ||
write.csv(topics,"topics1.csv") | ||
write.csv(topics2,"topics2.csv") | ||
write.csv(topics3,"topics3.csv") | ||
kresult <- searchK(dfm, | ||
K=c(100,110,120,130,140,150), | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm), | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm), | ||
max.em.its = 50) | ||
View(dfm) | ||
kresult <- searchK(dfm, | ||
K=c(100,110,120,130,140,150), | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm), | ||
max.em.its = 50) | ||
kresult <- searchK(dfm, | ||
K=c(100,110,120,130,140,150), | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm), | ||
max.em.its = 50) | ||
kresult <- searchK(dfm, seed = 2108, | ||
K=c(100,110,120,130,140,150), | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm), | ||
max.em.its = 50) | ||
View(data) | ||
View(dfm) | ||
kresult <- searchK(dfm, | ||
seed = 2108, | ||
K=c(100,110,120,130,140,150), | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm)) | ||
kresult <- searchK(vocab = dfm, | ||
seed = 2108, | ||
K=c(100,110,120,130,140,150), | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm)) | ||
kresult <- searchK(documents = dfm, | ||
vocab = dfm, | ||
seed = 2108, | ||
K=c(100,110,120,130,140,150), | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm)) | ||
kresult <- searchK(dfm, | ||
seed = 2108, | ||
K=c(110,120,130,140,150), | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm)) | ||
set.seed(210823) | ||
K <- c(110,120,130,140,150) | ||
kresult <- searchK(dfm, | ||
K, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm), replace = ) | ||
kresult <- searchK(dfm, | ||
K, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm)) | ||
kresult <- searchK(dfm, K, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, | ||
data = docvars(dfm)) | ||
kresult <- searchK(dfm, K, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact) | ||
kresult <- searchK(dfm, K, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact) | ||
kresult <- searchK(documents = dfm, K, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, data = docvars(dfm)) | ||
K <- c(110,120,130,140,150) | ||
kresult <- searchK(documents = dfm, K, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, data = docvars(dfm)) | ||
set.seed(210823) | ||
K <- c(110,120,130,140,150) | ||
kresult <- searchK(documents = dfm, K, | ||
prevalence =~first_author_female + last_author_female + majority_female_binary + | ||
s(year) + subfield + X1_gii_quartile + X2_gii_quartile + impact, data = docvars(dfm)) |
Oops, something went wrong.