# Load Libraries
library("rvest")
library("tidyverse")
library("ggplot2")
library("wordcloud")
library("tidytext")
library("tinytex")
library("syuzhet")
library("lubridate")
library("scales")
library("reshape2")
library("dplyr")

# This scrape collects all of the internal links from the first ten
# index pages of /pol/.

# Page 1 scrape
pol_threads1 <- read_html("https://boards.4channel.org/pol/") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 2 scrape
pol_threads2 <- read_html("https://boards.4channel.org/pol/2") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 3 scrape
pol_threads3 <- read_html("https://boards.4channel.org/pol/3") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 4 scrape
pol_threads4 <- read_html("https://boards.4channel.org/pol/4") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 5 scrape
pol_threads5 <- read_html("https://boards.4channel.org/pol/5") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 6 scrape
pol_threads6 <- read_html("https://boards.4channel.org/pol/6") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 7 scrape
pol_threads7 <- read_html("https://boards.4channel.org/pol/7") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 8 scrape
pol_threads8 <- read_html("https://boards.4channel.org/pol/8") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 9 scrape
pol_threads9 <- read_html("https://boards.4channel.org/pol/9") %>%
  html_elements("a") %>%
  html_attr("href")

# Page 10 scrape
pol_threads10 <- read_html("https://boards.4channel.org/pol/10") %>%
  html_elements("a") %>%
  html_attr("href")

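# The ten near-identical blocks above could be collapsed into a single
# loop. A minimal sketch, left commented out so it does not shadow the
# objects above; the Sys.sleep(1) pause is an assumption, added only to
# stay polite toward the server:
#
# pol_pages <- c("https://boards.4channel.org/pol/",
#                paste0("https://boards.4channel.org/pol/", 2:10))
# pol_threads_list <- lapply(pol_pages, function(url) {
#   Sys.sleep(1)
#   read_html(url) %>%
#     html_elements("a") %>%
#     html_attr("href")
# })
# df_pol <- unlist(pol_threads_list)
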
# Combine all of the scraped links into one vector
df_pol <- c(pol_threads1,
            pol_threads2,
            pol_threads3,
            pol_threads4,
            pol_threads5,
            pol_threads6,
            pol_threads7,
            pol_threads8,
            pol_threads9,
            pol_threads10)

# tibble() turns the scraped links into a one-column table
pol_table <- tibble(txt = df_pol)

# Keep only the links that look like thread permalinks,
# e.g. "thread/123456789/this-is-a-thread"
df_links <- pol_table %>%
  filter(str_detect(txt, "thread/[0-9]{6,}/[a-z]+"))

# Prepend "https://boards.4chan.org/pol/" to each relative "thread/..." link
df_links$txt <- paste0("https://boards.4chan.org/pol/", df_links$txt)

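# An index page can link the same thread more than once, so the list
# may contain duplicates. Deduplicating here is a sketch based on that
# assumption about the page structure:
#
# df_links <- distinct(df_links)
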
# Apply read_html(), html_elements(), and html_attr() to each link in
# the data frame, collecting the class attribute of every <span> in the
# thread; the later steps pull the poster IDs out of these class strings
threads <- lapply(df_links$txt, function(x) {
  read_html(x) %>%
    html_elements("span") %>%
    html_attr("class")})

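# Thread pages can 404 between the index scrape and this loop. A
# minimal sketch that skips dead threads instead of aborting (an
# assumption about the desired behavior):
#
# threads <- lapply(df_links$txt, function(x) {
#   tryCatch(
#     read_html(x) %>%
#       html_elements("span") %>%
#       html_attr("class"),
#     error = function(e) character(0))
# })
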
# Flatten the per-thread class vectors into a single one-column data
# frame; unlist() handles threads that return different numbers of spans
threads_col_all <- data.frame(txt = unlist(threads), stringsAsFactors = FALSE)

# Break every class string into single tokens and keep only the
# poster-ID tokens, which look like "id_A1b2c45d".
# to_lower = FALSE keeps the IDs' upper-case letters, which the default
# lowercasing would otherwise fold together.
tidy_pol_IDs <- threads_col_all %>%
  unnest_tokens(word, txt, format = "text", to_lower = FALSE) %>%
  filter(str_detect(word, "id_[a-zA-Z0-9]{8}"))

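# The same IDs could be pulled straight from the class strings with
# stringr alone; a sketch under the assumption that every ID is "id_"
# followed by exactly eight alphanumerics:
#
# tidy_pol_IDs <- tibble(
#   word = unlist(str_extract_all(threads_col_all$txt, "id_[a-zA-Z0-9]{8}")))
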
# === Import CSV as tidy_pol_IDs ===
# tidy_pol_IDs <- read.csv("~/Documents/Stats/4ChanScraper/Aug 28 2023 06:01:27 PM.csv")

# Count each ID's posts across all threads, sorted with the most
# prolific posters first
tidy_pol_sorted_IDs <- tidy_pol_IDs %>%
  count(word, sort = TRUE)

# Not used downstream, but shows the 50 least frequent IDs;
# a few hundred posters only post twice
tidy_pol_bottom_10_IDs <- tidy_pol_IDs %>%
  count(word, sort = TRUE) %>%
  tail(50)

# Count all occurrences of each unique ID across threads
count_by_IDs <- tidy_pol_IDs %>%
  count(word, sort = TRUE)

# Tally how many IDs share each post count.
# e.g., a row with n = 2 and nn = 500 would mean 500 unique IDs
# each posted exactly twice across all threads
count_by_n <- tidy_pol_sorted_IDs %>%
  count(n, sort = TRUE)

# =========== Time to Visualize ===========

# Plot the 30 biggest contributors by post count across all threads
tidy_pol_sorted_IDs %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "darkblue") +
  xlab("Poster ID") +
  ylab("Count") +
  coord_flip() +
  theme_dark()

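# ggsave() writes the most recent plot to disk; the path and dimensions
# below are assumptions for illustration:
#
# ggsave("~/Documents/Stats/4ChanScraper/top-posters.png",
#        width = 8, height = 6)
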
# Plot how many IDs were observed at each post count.
# Example: the bar at n = 2 shows how many IDs posted exactly twice.
# top_n(20) keeps the 20 most common post counts, and reorder() sorts
# the bars by frequency.
count_by_n %>%
  top_n(20) %>%
  mutate(n = reorder(n, nn)) %>%
  ggplot(aes(n, nn)) +
  geom_col(fill = "darkblue") +
  xlab("Number of Posts per ID") +
  ylab("Number of IDs") +
  coord_flip() +
  theme_dark()

# Time to Save the Data
timestamp <- format(Sys.time(), "%b %d %Y %X")
filename <- paste0("~/Documents/Stats/4ChanScraper/Pol-IDs-", timestamp, ".csv")
write.csv(tidy_pol_IDs, file = filename)
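
# "%X" puts colons in the file name, which Linux and macOS accept but
# Windows does not. A portable alternative (an assumption, not the
# format used above):
#
# timestamp <- format(Sys.time(), "%Y-%m-%d_%H-%M-%S")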