Update 4chan pol ngram Scraper v2.R

2023-11-30 22:57:10 -04:00 · 2023-11-30 22:57:10 -04:00 · 601ef7d30c
parent fd3537c7a6
commit 601ef7d30c
1 changed files with 23 additions and 6 deletions
--- a/scripts/4chan
+++ b/scripts/4chan
@ -111,9 +111,22 @@ threads_tibble <- tibble(txt = threads)
 # Split the thread text column `txt` into lower-cased two-word n-grams (bigrams),
 # one bigram per row in the `word` column.
 tidy_pol <- threads_tibble %>% 
  unnest_tokens(word, txt, format = "text", token = "ngrams", n = 2, to_lower = TRUE)
 # Keep only rows whose bigram contains two runs of at least three lowercase letters
 # separated by a single space. The pattern is not anchored, so a match anywhere in
 # the string is enough; extra characters around the match are allowed.
 tidy_pol_fixed <- tidy_pol %>%
  filter(str_detect(word, "([a-z]{3,} [a-z]{3,})"))
 # Failed filter attempts, kept commented out for reference
 # tidy_pol_fixed2 <- tidy_pol_fixed %>% 
 #   filter(str_detect(word, "[_]{1,}"))
 # tidy_pol_fixed <- tidy_pol %>%
 #   filter(str_detect(word, "([\\w\\S_])"))
 # tidy_pol_fixed <- tidy_pol %>%
 #   filter(!grepl("[a-z] [a-z]", word))
 # tidy_pol_fixed <- tidy_pol %>%
 #   filter(str_detect(word, "([\\d-] [\\S-])"))
 tidy_pol_fixed_separated <- tidy_pol_fixed %>%  
  separate(word, into = c("word1", "word2"), sep = " ") %>% 
@ -595,6 +608,8 @@ tidy_pol_united_ngram<- tidy_pol_fixed_separated %>%
         word2 %in% GradyAugmented) %>%
  unite(word, c(word1, word2), sep = " ")
 # Capture today's date once; a later step in this script binds it onto the
 # counted table as a column via cbind(date).
 date <- Sys.Date()
 tidy_pol_fixed2_ngram <- tidy_pol_united_ngram %>% 
  count(word, sort = TRUE) %>% 
@ -602,16 +617,18 @@ tidy_pol_fixed2_ngram <- tidy_pol_united_ngram %>%
         & !word == "nigger nigger"
         & !word == "based based"
         & !word == "jew jew"
-         & !word == "niggers niggers") %>% 
+         & !word == "niggers niggers"
         & !word == "baking baking") %>% 
  cbind(date) %>% 
  print(n=70)
 # =========== Time to Visualize ===========
-#tidy_pol_fixed2 <- read.csv("~/Documents/Stats/4ChanScraper/ngram Sep 01 2023 22:26:23.csv"  )
+#tidy_pol_fixed2_ngram <- read.csv("~/Documents/Stats/4ChanScraper/csv/ngram Sep 01 2023 22:26:23.csv")
 tidy_pol_fixed2_ngram %>% 
-  top_n(70) %>% 
+  top_n(50) %>% 
  mutate(word = reorder(word, n)) %>% 
  ggplot(aes(word, n, fill = n)) +
  geom_bar(stat = "identity") +
@ -625,10 +642,10 @@ tidy_pol_fixed2_ngram %>%
 # Draw a word cloud from the bigram table: with() evaluates the call inside the
 # data frame, so `word` supplies the labels and `n` the frequencies.
 # This change raises the word cap from 75 to 100 and the size range from
 # c(1.5,0.75) to c(2,1).
 tidy_pol_fixed2_ngram %>% 
-  with(wordcloud(word, n, max.words = 75, scale = c(1.5,0.75), random.order = FALSE, rot.per = 0.0, 
+  with(wordcloud(word, n, max.words = 100, scale = c(2,1), random.order = FALSE, rot.per = 0.0, 
                 colors = brewer.pal(8, "Dark2")))
 # Time to Save the Data
 # The file name embeds the current system time so each run writes a new file.
 # NOTE(review): "%X" is the locale's time-of-day format and usually renders with
 # colons, which some filesystems reject in file names - confirm on the target OS.
 timestamp <- format(Sys.time(), "%b %d %Y %X")
-filename <- paste0("~/Documents/Stats/4ChanScraper/ngram ",timestamp,".csv")  
+filename <- paste0("~/Documents/Stats/4ChanScraper/csv/ngram ",timestamp,".csv")  
 # Write the counted bigram table, not tidy_pol_fixed. tidy_pol_fixed is only the
 # regex-filtered token list: it has no `n` column and no date, so a CSV made from
 # it cannot be reloaded into tidy_pol_fixed2_ngram (see the commented-out
 # read.csv() in the visualization section) and fed to the plots, which read `n`.
 write.csv(tidy_pol_fixed2_ngram, file = filename)