117 lines
3.2 KiB
R
117 lines
3.2 KiB
R
#Load Libraries
|
|
library("rvest")
|
|
library("tidyverse")
|
|
library("ggplot2")
|
|
library("wordcloud")
|
|
library("tidytext")
|
|
|
|
#Page 1 Scrape
|
|
pol_threads1 <- read_html("https://boards.4channel.org/pol/") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 2 Scrape
|
|
pol_threads2 <- read_html("https://boards.4channel.org/pol/2") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 3 Scrape
|
|
pol_threads3 <- read_html("https://boards.4channel.org/pol/3") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 4 Scrape
|
|
pol_threads4 <- read_html("https://boards.4channel.org/pol/4") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 5 Scrape
|
|
pol_threads5 <- read_html("https://boards.4channel.org/pol/5") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 6 Scrape
|
|
pol_threads6 <- read_html("https://boards.4channel.org/pol/6") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 7 Scrape
|
|
pol_threads7 <- read_html("https://boards.4channel.org/pol/7") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 8 Scrape
|
|
pol_threads8 <- read_html("https://boards.4channel.org/pol/8") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 9 Scrape
|
|
pol_threads9 <- read_html("https://boards.4channel.org/pol/9") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
#Page 10 Scrape
|
|
pol_threads10 <- read_html("https://boards.4channel.org/pol/10") %>%
|
|
html_elements("blockquote.postMessage") %>%
|
|
html_text()
|
|
|
|
# tibble makes a table out of data
|
|
df_pol <- c(pol_threads1,
|
|
pol_threads2,
|
|
pol_threads3,
|
|
pol_threads4,
|
|
pol_threads5,
|
|
pol_threads6,
|
|
pol_threads7,
|
|
pol_threads8,
|
|
pol_threads9,
|
|
pol_threads10)
|
|
|
|
# Making a table with tibble out of the above concatenated data
|
|
pol_table <- tibble(txt = df_pol)
|
|
|
|
tidy_pol <- pol_table %>%
|
|
unnest_tokens(word, txt, format = "text")
|
|
|
|
# Removing stop words like "as, just, is, in," etc.
|
|
# Also removing numerical "words" that come up
|
|
tidy_pol_fixed <- tidy_pol %>%
|
|
filter(!word %in% stop_words$word
|
|
& !word == "fucking"
|
|
& !word == "https"
|
|
& !word == "shit"
|
|
& !is.numeric(word))
|
|
|
|
tidy_pol_fixed2 <- tidy_pol_fixed %>%
|
|
count(word, sort = TRUE) %>%
|
|
print(n = 50)
|
|
|
|
# Time to Visualize with ggplot and wordcloud
|
|
|
|
tidy_pol_fixed2 %>%
|
|
top_n(50) %>%
|
|
ggplot(aes(word, n)) +
|
|
geom_col() +
|
|
xlab("Words") +
|
|
ylab("Count") +
|
|
coord_flip()
|
|
|
|
tidy_pol_fixed2 %>%
|
|
top_n(50) %>%
|
|
mutate(word = reorder(word, n)) %>%
|
|
ggplot(aes(word, n)) +
|
|
geom_col() +
|
|
xlab("Words") +
|
|
ylab("Count") +
|
|
coord_flip()
|
|
|
|
tidy_pol_fixed2 %>%
|
|
with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0.0,
|
|
colors = brewer.pal(8, "Dark2")))
|
|
|
|
|
|
# Save the Data you scraped
|
|
# Make sure to change the date when saving to not overwrite the old data
|
|
# Don't save "tidy_pol_fixed2" to a csv, it only contains a fraction of the "tidy_pol_fixed" data set.
|
|
write.csv(tidy_pol_fixed, "~/Documents/Stats/4Chan Scraper/Aug-22-2023.csv", row.names=FALSE)
|