From d5a4f4eba82a942e846220079815bd4a98d9f2ee Mon Sep 17 00:00:00 2001
From: Lucky <66523959+l-ucky@users.noreply.github.com>
Date: Tue, 22 Aug 2023 23:31:28 -0300
Subject: [PATCH] Create 4chan Scraper.R

First version of a 4chan post scraper. It scrapes the OP post and the few
preview replies shown on the index pages of a board. Enjoy data mining!
---
 4chan Scraper.R | 116 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 4chan Scraper.R

diff --git a/4chan Scraper.R b/4chan Scraper.R
new file mode 100644
index 0000000..f7eadf4
--- /dev/null
+++ b/4chan Scraper.R
@@ -0,0 +1,116 @@
+# Load libraries (ggplot2 and stringr are attached by tidyverse, but listing
+# ggplot2 explicitly does no harm)
+library("rvest")
+library("tidyverse")
+library("ggplot2")
+library("wordcloud")
+library("tidytext")
+
+# Page 1 Scrape (the board's front page has no page number in its URL)
+pol_threads1 <- read_html("https://boards.4channel.org/pol/") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 2 Scrape
+pol_threads2 <- read_html("https://boards.4channel.org/pol/2") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 3 Scrape
+pol_threads3 <- read_html("https://boards.4channel.org/pol/3") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 4 Scrape
+pol_threads4 <- read_html("https://boards.4channel.org/pol/4") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 5 Scrape
+pol_threads5 <- read_html("https://boards.4channel.org/pol/5") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 6 Scrape
+pol_threads6 <- read_html("https://boards.4channel.org/pol/6") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 7 Scrape
+pol_threads7 <- read_html("https://boards.4channel.org/pol/7") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 8 Scrape
+pol_threads8 <- read_html("https://boards.4channel.org/pol/8") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 9 Scrape
+pol_threads9 <- read_html("https://boards.4channel.org/pol/9") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
+
+# Page 10 Scrape
+pol_threads10 <- read_html("https://boards.4channel.org/pol/10") %>%
+  html_elements("blockquote.postMessage") %>%
+  html_text()
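+
+# Aside: the ten near-identical page scrapes above could be collapsed into a
+# single loop. A minimal sketch, assuming the same URL scheme and CSS selector
+# as above; it is commented out so the script does not fetch every page twice,
+# and "df_pol_loop" is a hypothetical stand-in for "df_pol" below.
+# pages <- c("", 2:10)  # page 1 has no number in its URL
+# df_pol_loop <- unlist(lapply(pages, function(p) {
+#   read_html(paste0("https://boards.4channel.org/pol/", p)) %>%
+#     html_elements("blockquote.postMessage") %>%
+#     html_text()
+# }))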
+
+# Combine the ten page scrapes into one character vector
+df_pol <- c(pol_threads1,
+            pol_threads2,
+            pol_threads3,
+            pol_threads4,
+            pol_threads5,
+            pol_threads6,
+            pol_threads7,
+            pol_threads8,
+            pol_threads9,
+            pol_threads10)
+
+# Make a one-column table out of the combined posts with tibble
+pol_table <- tibble(txt = df_pol)
+
+# Split each post into one word per row
+tidy_pol <- pol_table %>%
+  unnest_tokens(word, txt, format = "text")
+
+# Remove stop words like "as", "just", "is", "in", etc., a few high-frequency
+# junk words, and purely numerical "words". Note that is.numeric() tests a
+# vector's type rather than whether a string looks like a number, so the
+# original filter never actually dropped digits; a regex match does.
+tidy_pol_fixed <- tidy_pol %>%
+  filter(!word %in% stop_words$word,
+         !word %in% c("fucking", "https", "shit"),
+         !str_detect(word, "^[0-9]+$"))
+
+# Count each word's occurrences, most frequent first
+tidy_pol_fixed2 <- tidy_pol_fixed %>%
+  count(word, sort = TRUE) %>%
+  print(n = 50)
+
+# Time to visualize with ggplot and wordcloud
+
+# Bar chart of the 50 most frequent words (bars in alphabetical order)
+tidy_pol_fixed2 %>%
+  top_n(50) %>%
+  ggplot(aes(word, n)) +
+  geom_col() +
+  xlab("Words") +
+  ylab("Count") +
+  coord_flip()
+
+# The same chart with the bars reordered by count
+tidy_pol_fixed2 %>%
+  top_n(50) %>%
+  mutate(word = reorder(word, n)) %>%
+  ggplot(aes(word, n)) +
+  geom_col() +
+  xlab("Words") +
+  ylab("Count") +
+  coord_flip()
+
+# Wordcloud of the 100 most frequent words
+tidy_pol_fixed2 %>%
+  with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0.0,
+                 colors = brewer.pal(8, "Dark2")))
+
+
+# Save the data you scraped
+# Change the date in the file name when saving so you don't overwrite old data
+# Save "tidy_pol_fixed" rather than "tidy_pol_fixed2": the latter is the
+# aggregated word-count table, not the full set of scraped tokens.
+write.csv(tidy_pol_fixed, "~/Documents/Stats/4Chan Scraper/Aug-22-2023.csv", row.names=FALSE)
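+
+# Aside: 4chan also publishes a read-only JSON API (documented at
+# https://github.com/4chan/4chan-API) that is gentler than scraping HTML.
+# A minimal sketch, assuming the catalog endpoint and its "com" field
+# (comment HTML); commented out because it needs the jsonlite package:
+# library("jsonlite")
+# catalog <- fromJSON("https://a.4cdn.org/pol/catalog.json")
+# op_html <- bind_rows(catalog$threads)$com                 # OP comments as HTML
+# op_text <- gsub("<[^>]+>", " ", op_html[!is.na(op_html)]) # crude tag strip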