From d5a4f4eba82a942e846220079815bd4a98d9f2ee Mon Sep 17 00:00:00 2001
From: Lucky <66523959+l-ucky@users.noreply.github.com>
Date: Tue, 22 Aug 2023 23:31:28 -0300
Subject: [PATCH] Create 4chan Scraper.R

First version of a 4chan post scraper. It scrapes the OP post and the few
preview replies shown on the index pages of a board. Enjoy data mining!
---
 4chan Scraper.R | 116 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 4chan Scraper.R

diff --git a/4chan Scraper.R b/4chan Scraper.R
new file mode 100644
index 0000000..f7eadf4
--- /dev/null
+++ b/4chan Scraper.R
@@ -0,0 +1,116 @@
+# Load libraries (ggplot2 and stringr are attached by tidyverse, but listing
+# ggplot2 explicitly does no harm)
+library("rvest")
+library("tidyverse")
+library("ggplot2")
+library("wordcloud")
+library("tidytext")
+
+# Page 1 Scrape (the board's front page has no page number in its URL)
+pol_threads1 <- read_html("https://boards.4channel.org/pol/") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 2 Scrape
+pol_threads2 <- read_html("https://boards.4channel.org/pol/2") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 3 Scrape
+pol_threads3 <- read_html("https://boards.4channel.org/pol/3") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 4 Scrape
+pol_threads4 <- read_html("https://boards.4channel.org/pol/4") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 5 Scrape
+pol_threads5 <- read_html("https://boards.4channel.org/pol/5") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 6 Scrape
+pol_threads6 <- read_html("https://boards.4channel.org/pol/6") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 7 Scrape
+pol_threads7 <- read_html("https://boards.4channel.org/pol/7") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 8 Scrape
+pol_threads8 <- read_html("https://boards.4channel.org/pol/8") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 9 Scrape
+pol_threads9 <- read_html("https://boards.4channel.org/pol/9") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 10 Scrape
+pol_threads10 <- read_html("https://boards.4channel.org/pol/10") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
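+
+# Aside: the ten near-identical page scrapes above could be collapsed into a
+# single loop. A minimal sketch, assuming the same URL scheme and CSS selector
+# as above; it is commented out so the script does not fetch every page twice,
+# and "df_pol_loop" is a hypothetical stand-in for "df_pol" below.
+# pages <- c("", 2:10)  # page 1 has no number in its URL
+# df_pol_loop <- unlist(lapply(pages, function(p) {
+#   read_html(paste0("https://boards.4channel.org/pol/", p)) %>%
+#     html_elements("blockquote.postMessage") %>%
+#     html_text()
+# }))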
+
+# Combine the ten page scrapes into one character vector
+df_pol <- c(pol_threads1,
+            pol_threads2,
+            pol_threads3,
+            pol_threads4,
+            pol_threads5,
+            pol_threads6,
+            pol_threads7,
+            pol_threads8,
+            pol_threads9,
+            pol_threads10)
+
+# Make a one-column table out of the combined posts with tibble
+pol_table <- tibble(txt = df_pol)
+
+# Split each post into one word per row
+tidy_pol <- pol_table %>%
+  unnest_tokens(word, txt, format = "text")
+
+# Remove stop words like "as", "just", "is", "in", etc., a few high-frequency
+# junk words, and purely numerical "words". Note that is.numeric() tests a
+# vector's type rather than whether a string looks like a number, so the
+# original filter never actually dropped digits; a regex match does.
+tidy_pol_fixed <- tidy_pol %>%
+  filter(!word %in% stop_words$word,
+         !word %in% c("fucking", "https", "shit"),
+         !str_detect(word, "^[0-9]+$"))
+
+# Count each word's occurrences, most frequent first
+tidy_pol_fixed2 <- tidy_pol_fixed %>%
+  count(word, sort = TRUE) %>%
+  print(n = 50)
+
+# Time to visualize with ggplot and wordcloud
+
+# Bar chart of the 50 most frequent words (bars in alphabetical order)
+tidy_pol_fixed2 %>%
+  top_n(50) %>%
+  ggplot(aes(word, n)) +
+  geom_col() +
+  xlab("Words") +
+  ylab("Count") +
+  coord_flip()
+
+# The same chart with the bars reordered by count
+tidy_pol_fixed2 %>%
+  top_n(50) %>%
+  mutate(word = reorder(word, n)) %>%
+  ggplot(aes(word, n)) +
+  geom_col() +
+  xlab("Words") +
+  ylab("Count") +
+  coord_flip()
+
+# Wordcloud of the 100 most frequent words
+tidy_pol_fixed2 %>%
+  with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0.0,
+                 colors = brewer.pal(8, "Dark2")))
+
+
+# Save the data you scraped
+# Change the date in the file name when saving so you don't overwrite old data
+# Save "tidy_pol_fixed" rather than "tidy_pol_fixed2": the latter is the
+# aggregated word-count table, not the full set of scraped tokens.
+write.csv(tidy_pol_fixed, "~/Documents/Stats/4Chan Scraper/Aug-22-2023.csv", row.names=FALSE)
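+
+# Aside: 4chan also publishes a read-only JSON API (documented at
+# https://github.com/4chan/4chan-API) that is gentler than scraping HTML.
+# A minimal sketch, assuming the catalog endpoint and its "com" field
+# (comment HTML); commented out because it needs the jsonlite package:
+# library("jsonlite")
+# catalog <- fromJSON("https://a.4cdn.org/pol/catalog.json")
+# op_html <- bind_rows(catalog$threads)$com                 # OP comments as HTML
+# op_text <- gsub("<[^>]+>", " ", op_html[!is.na(op_html)]) # crude tag strip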