# 4Chan-Web-Scraper/4chan Scraper.R

# Load libraries (tidyverse already attaches ggplot2, dplyr, purrr, and stringr)
library("rvest")
library("tidyverse")
library("wordcloud")   # also attaches RColorBrewer, used by brewer.pal() below
library("tidytext")
# Scrape pages 1-10 of /pol/: page 1 lives at the board root, pages 2-10 at
# /pol/2 .. /pol/10. Each post body sits in a blockquote.postMessage element.
pol_urls <- c("https://boards.4channel.org/pol/",
              paste0("https://boards.4channel.org/pol/", 2:10))
pol_threads <- map(pol_urls, ~ read_html(.x) %>%
                     html_elements("blockquote.postMessage") %>%
                     html_text())
# Flatten the per-page results into one character vector of posts
df_pol <- unlist(pol_threads)
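# A politer variant of the scrape (a sketch, not something the site documents):
# pause between requests and treat a failed page as zero posts rather than
# aborting the whole run. The 2-second delay is an assumption.
# pol_threads <- map(pol_urls, function(url) {
#   Sys.sleep(2)                                   # be gentle on the server
#   tryCatch(
#     read_html(url) %>%
#       html_elements("blockquote.postMessage") %>%
#       html_text(),
#     error = function(e) character(0)             # failed page -> no posts
#   )
# })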
# Making a table with tibble out of the above concatenated data
pol_table <- tibble(txt = df_pol)
# Tokenize: one lowercase word per row, with punctuation stripped
tidy_pol <- pol_table %>%
  unnest_tokens(word, txt, format = "text")
# Remove stop words ("as", "just", "is", "in", etc.), a few custom noise
# words, and tokens that are nothing but digits
tidy_pol_fixed <- tidy_pol %>%
  filter(!word %in% stop_words$word,
         !word %in% c("fucking", "https", "shit"),
         !str_detect(word, "^[0-9]+$"))
# Count word frequencies, most common first, and show the top 50 in the console
tidy_pol_fixed2 <- tidy_pol_fixed %>%
  count(word, sort = TRUE)
print(tidy_pol_fixed2, n = 50)
# Time to visualize with ggplot and wordcloud
# Bar chart of the 50 most frequent words (unordered)
tidy_pol_fixed2 %>%
  top_n(50, n) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab("Words") +
  ylab("Count") +
  coord_flip()
# The same chart with bars reordered by count, which reads much better
tidy_pol_fixed2 %>%
  top_n(50, n) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab("Words") +
  ylab("Count") +
  coord_flip()
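# Optional (a sketch): save the ordered chart next to the CSV below. The
# filename and dimensions are assumptions, not part of the original workflow.
# ggsave("~/Documents/Stats/4Chan Scraper/top_words.png", width = 6, height = 10)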
# Word cloud of the 100 most frequent words; the layout has a random element,
# so call set.seed() first if you want the same cloud every run
tidy_pol_fixed2 %>%
  with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0.0,
                 colors = brewer.pal(8, "Dark2")))
# Save the data you scraped
# Change the date in the filename on each run so old data isn't overwritten
# Save "tidy_pol_fixed" rather than "tidy_pol_fixed2": the counts table keeps
# only one row per unique word, while "tidy_pol_fixed" keeps every occurrence
write.csv(tidy_pol_fixed, "~/Documents/Stats/4Chan Scraper/Aug-22-2023.csv", row.names = FALSE)
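# An alternative (a sketch) that stamps the filename with today's date
# automatically, so nothing gets overwritten; %b-%d-%Y reproduces the
# "Aug-22-2023" pattern (month name is locale-dependent):
# write.csv(tidy_pol_fixed,
#           file.path("~/Documents/Stats/4Chan Scraper",
#                     paste0(format(Sys.Date(), "%b-%d-%Y"), ".csv")),
#           row.names = FALSE)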