Update 4chan pol ngram Scraper v2.R
This commit is contained in:
parent
fd3537c7a6
commit
601ef7d30c
|
|
@ -111,9 +111,22 @@ threads_tibble <- tibble(txt = threads)
|
||||||
tidy_pol <- threads_tibble %>%
|
tidy_pol <- threads_tibble %>%
|
||||||
unnest_tokens(word, txt, format = "text", token = "ngrams", n = 2, to_lower = TRUE)
|
unnest_tokens(word, txt, format = "text", token = "ngrams", n = 2, to_lower = TRUE)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
tidy_pol_fixed <- tidy_pol %>%
|
tidy_pol_fixed <- tidy_pol %>%
|
||||||
filter(str_detect(word, "([a-z]{3,} [a-z]{3,})"))
|
filter(str_detect(word, "([a-z]{3,} [a-z]{3,})"))
|
||||||
|
|
||||||
|
# Failures
|
||||||
|
# tidy_pol_fixed2 <- tidy_pol_fixed %>%
|
||||||
|
# filter(str_detect(word, "[_]{1,}"))
|
||||||
|
# tidy_pol_fixed <- tidy_pol %>%
|
||||||
|
# filter(str_detect(word, "([\\w\\S_])"))
|
||||||
|
# tidy_pol_fixed <- tidy_pol %>%
|
||||||
|
# filter(!grepl("[a-z] [a-z]", word))
|
||||||
|
# tidy_pol_fixed <- tidy_pol %>%
|
||||||
|
# filter(str_detect(word, "([\\d-] [\\S-])"))
|
||||||
|
|
||||||
|
|
||||||
tidy_pol_fixed_separated <- tidy_pol_fixed %>%
|
tidy_pol_fixed_separated <- tidy_pol_fixed %>%
|
||||||
separate(word, into = c("word1", "word2"), sep = " ") %>%
|
separate(word, into = c("word1", "word2"), sep = " ") %>%
|
||||||
|
|
@ -595,6 +608,8 @@ tidy_pol_united_ngram<- tidy_pol_fixed_separated %>%
|
||||||
word2 %in% GradyAugmented) %>%
|
word2 %in% GradyAugmented) %>%
|
||||||
unite(word, c(word1, word2), sep = " ")
|
unite(word, c(word1, word2), sep = " ")
|
||||||
|
|
||||||
|
# Add date
|
||||||
|
date <- Sys.Date()
|
||||||
|
|
||||||
tidy_pol_fixed2_ngram <- tidy_pol_united_ngram %>%
|
tidy_pol_fixed2_ngram <- tidy_pol_united_ngram %>%
|
||||||
count(word, sort = TRUE) %>%
|
count(word, sort = TRUE) %>%
|
||||||
|
|
@ -602,16 +617,18 @@ tidy_pol_fixed2_ngram <- tidy_pol_united_ngram %>%
|
||||||
& !word == "nigger nigger"
|
& !word == "nigger nigger"
|
||||||
& !word == "based based"
|
& !word == "based based"
|
||||||
& !word == "jew jew"
|
& !word == "jew jew"
|
||||||
& !word == "niggers niggers") %>%
|
& !word == "niggers niggers"
|
||||||
|
& !word == "baking baking") %>%
|
||||||
|
cbind(date) %>%
|
||||||
print(n=70)
|
print(n=70)
|
||||||
|
|
||||||
|
|
||||||
# =========== Time to Visualize ===========
|
# =========== Time to Visualize ===========
|
||||||
#tidy_pol_fixed2 <- read.csv("~/Documents/Stats/4ChanScraper/ngram Sep 01 2023 22:26:23.csv" )
|
#tidy_pol_fixed2_ngram <- read.csv("~/Documents/Stats/4ChanScraper/csv/ngram Sep 01 2023 22:26:23.csv")
|
||||||
|
|
||||||
|
|
||||||
tidy_pol_fixed2_ngram %>%
|
tidy_pol_fixed2_ngram %>%
|
||||||
top_n(70) %>%
|
top_n(50) %>%
|
||||||
mutate(word = reorder(word, n)) %>%
|
mutate(word = reorder(word, n)) %>%
|
||||||
ggplot(aes(word, n, fill = n)) +
|
ggplot(aes(word, n, fill = n)) +
|
||||||
geom_bar(stat = "identity") +
|
geom_bar(stat = "identity") +
|
||||||
|
|
@ -625,10 +642,10 @@ tidy_pol_fixed2_ngram %>%
|
||||||
|
|
||||||
|
|
||||||
tidy_pol_fixed2_ngram %>%
|
tidy_pol_fixed2_ngram %>%
|
||||||
with(wordcloud(word, n, max.words = 75, scale = c(1.5,0.75), random.order = FALSE, rot.per = 0.0,
|
with(wordcloud(word, n, max.words = 100, scale = c(2,1), random.order = FALSE, rot.per = 0.0,
|
||||||
colors = brewer.pal(8, "Dark2")))
|
colors = brewer.pal(8, "Dark2")))
|
||||||
|
|
||||||
# Time to Save the Data
|
# Time to Save the Data
|
||||||
timestamp <- format(Sys.time(), "%b %d %Y %X")
|
timestamp <- format(Sys.time(), "%b %d %Y %X")
|
||||||
filename <- paste0("~/Documents/Stats/4ChanScraper/ngram ",timestamp,".csv")
|
filename <- paste0("~/Documents/Stats/4ChanScraper/csv/ngram ",timestamp,".csv")
|
||||||
write.csv(tidy_pol_fixed2_ngram, file = filename)
|
write.csv(tidy_pol_fixed, file = filename)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue