# 4Chan-Web-Scraper/4chan Scraper.R

# Load libraries (tidyverse already attaches ggplot2, dplyr, purrr, and stringr)
library("rvest")
library("tidyverse")
library("wordcloud")   # also attaches RColorBrewer, used by brewer.pal() below
library("tidytext")
# Scrape pages 1-10 of /pol/: page 1 lives at the board root, pages 2-10 at
# /pol/2 .. /pol/10. Each post body sits in a blockquote.postMessage element.
pol_urls <- c("https://boards.4channel.org/pol/",
              paste0("https://boards.4channel.org/pol/", 2:10))
pol_threads <- map(pol_urls, ~ read_html(.x) %>%
                     html_elements("blockquote.postMessage") %>%
                     html_text())
# Flatten the per-page results into one character vector of posts
df_pol <- unlist(pol_threads)
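# A politer variant of the scrape (a sketch, not something the site documents):
# pause between requests and treat a failed page as zero posts rather than
# aborting the whole run. The 2-second delay is an assumption.
# pol_threads <- map(pol_urls, function(url) {
#   Sys.sleep(2)                                   # be gentle on the server
#   tryCatch(
#     read_html(url) %>%
#       html_elements("blockquote.postMessage") %>%
#       html_text(),
#     error = function(e) character(0)             # failed page -> no posts
#   )
# })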
# Making a table with tibble out of the above concatenated data
pol_table <- tibble(txt = df_pol)
# Tokenize: one lowercase word per row, with punctuation stripped
tidy_pol <- pol_table %>%
  unnest_tokens(word, txt, format = "text")
# Remove stop words ("as", "just", "is", "in", etc.), a few custom noise
# words, and tokens that are nothing but digits
tidy_pol_fixed <- tidy_pol %>%
  filter(!word %in% stop_words$word,
         !word %in% c("fucking", "https", "shit"),
         !str_detect(word, "^[0-9]+$"))
# Count word frequencies, most common first, and show the top 50 in the console
tidy_pol_fixed2 <- tidy_pol_fixed %>%
  count(word, sort = TRUE)
print(tidy_pol_fixed2, n = 50)
# Time to visualize with ggplot and wordcloud
# Bar chart of the 50 most frequent words (unordered)
tidy_pol_fixed2 %>%
  top_n(50, n) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab("Words") +
  ylab("Count") +
  coord_flip()
# The same chart with bars reordered by count, which reads much better
tidy_pol_fixed2 %>%
  top_n(50, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab("Words") +
  ylab("Count") +
  coord_flip()
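# Optional (a sketch): save the ordered chart next to the CSV below. The
# filename and dimensions are assumptions, not part of the original workflow.
# ggsave("~/Documents/Stats/4Chan Scraper/top_words.png", width = 6, height = 10)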
# Word cloud of the 100 most frequent words; the layout has a random element,
# so call set.seed() first if you want the same cloud every run
tidy_pol_fixed2 %>%
  with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0.0,
                 colors = brewer.pal(8, "Dark2")))
# Save the data you scraped
# Change the date in the filename on each run so old data isn't overwritten
# Save "tidy_pol_fixed" rather than "tidy_pol_fixed2": the counts table keeps
# only one row per unique word, while "tidy_pol_fixed" keeps every occurrence
write.csv(tidy_pol_fixed, "~/Documents/Stats/4Chan Scraper/Aug-22-2023.csv", row.names = FALSE)
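# An alternative (a sketch) that stamps the filename with today's date
# automatically, so nothing gets overwritten; %b-%d-%Y reproduces the
# "Aug-22-2023" pattern (month name is locale-dependent):
# write.csv(tidy_pol_fixed,
#           file.path("~/Documents/Stats/4Chan Scraper",
#                     paste0(format(Sys.Date(), "%b-%d-%Y"), ".csv")),
#           row.names = FALSE)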